summaryrefslogtreecommitdiff
path: root/mpn/pa32
diff options
context:
space:
mode:
authortege <tege@gmplib.org>2002-03-26 17:30:50 +0100
committertege <tege@gmplib.org>2002-03-26 17:30:50 +0100
commit59ccffe8a22a0c991f9c44a8c17db500f9b08a51 (patch)
treef2f02ca330e0de3bd9b186f54071a1c8f1b195a5 /mpn/pa32
parent337d7796e7aadc1055f2e2e4c89f5849f6a1f994 (diff)
downloadgmp-59ccffe8a22a0c991f9c44a8c17db500f9b08a51.tar.gz
Moved from mpn/hppa.
Diffstat (limited to 'mpn/pa32')
-rw-r--r--mpn/pa32/README143
-rw-r--r--mpn/pa32/add_n.asm54
-rw-r--r--mpn/pa32/gmp-mparam.h56
-rw-r--r--mpn/pa32/hppa1_1/addmul_1.asm98
-rw-r--r--mpn/pa32/hppa1_1/gmp-mparam.h63
-rw-r--r--mpn/pa32/hppa1_1/mul_1.asm94
-rw-r--r--mpn/pa32/hppa1_1/pa7100/add_n.asm74
-rw-r--r--mpn/pa32/hppa1_1/pa7100/addmul_1.asm192
-rw-r--r--mpn/pa32/hppa1_1/pa7100/lshift.asm86
-rw-r--r--mpn/pa32/hppa1_1/pa7100/rshift.asm83
-rw-r--r--mpn/pa32/hppa1_1/pa7100/sub_n.asm75
-rw-r--r--mpn/pa32/hppa1_1/pa7100/submul_1.asm198
-rw-r--r--mpn/pa32/hppa1_1/sqr_diagonal.asm51
-rw-r--r--mpn/pa32/hppa1_1/submul_1.asm107
-rw-r--r--mpn/pa32/hppa1_1/udiv_qrnnd.asm96
-rw-r--r--mpn/pa32/hppa1_1/umul.asm38
-rw-r--r--mpn/pa32/hppa2_0/add_n.asm98
-rw-r--r--mpn/pa32/hppa2_0/gmp-mparam.h57
-rw-r--r--mpn/pa32/hppa2_0/sqr_diagonal.asm103
-rw-r--r--mpn/pa32/hppa2_0/sub_n.asm98
-rw-r--r--mpn/pa32/lshift.asm66
-rw-r--r--mpn/pa32/rshift.asm63
-rw-r--r--mpn/pa32/sub_n.asm55
-rw-r--r--mpn/pa32/udiv_qrnnd.asm282
24 files changed, 2330 insertions, 0 deletions
diff --git a/mpn/pa32/README b/mpn/pa32/README
new file mode 100644
index 000000000..b7c26318d
--- /dev/null
+++ b/mpn/pa32/README
@@ -0,0 +1,143 @@
+Copyright 1996, 1999, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+
+
+
+
+
+
+This directory contains mpn functions for various HP PA-RISC chips. Code
+that runs faster on the PA7100 and later implementations, is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On the PA7000 no memory instructions can issue the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lookup-free cache, so it helps to schedule loads and the
+dependent instruction really far from each other.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+ instructions below (but some sw pipelining is needed to avoid the
+ xmpyu-fstds delay):
+
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+ (asymptotically) on the PA7100, using the instructions below. With proper
+ sw pipelining and the unrolling level below, the speed becomes 8
+ cycles/limb.
+
+ fldds s1_ptr
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ addc
+ addc
+ addc
+ addc
+ addc %r0,%r0,cy-limb
+
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ add
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs before compiler
+ support emerges. But we want to use 64-bit operations whenever possible,
+ in particular for loads and stores. It is possible to handle mpn_add_n
+ efficiently by rotating (when s1/s2 are aligned), masking+bit field
+ inserting when (they are not). The speed should double compared to the
+ code used today.
+
+
+
+
+LABEL SYNTAX
+
+The HP-UX assembler takes labels starting in column 0 with no colon,
+
+ L$loop ldws,mb -4(0,%r25),%r22
+
+Gas on hppa GNU/Linux however requires a colon,
+
+ L$loop: ldws,mb -4(0,%r25),%r22
+
+Fortunately both accept a ".label" pseudo-op,
+
+ .label L$loop
+ ldws,mb -4(0,%r25),%r22
+
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/mpn/pa32/add_n.asm b/mpn/pa32/add_n.asm
new file mode 100644
index 000000000..9b201c445
--- /dev/null
+++ b/mpn/pa32/add_n.asm
@@ -0,0 +1,54 @@
+dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s1_ptr gr25
+C s2_ptr gr24
+C size gr23
+
+C One might want to unroll this as for other processors, but it turns out that
+C the data cache contention after a store makes such unrolling useless. We
+C can't come under 5 cycles/limb anyway.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L(end) C check for (SIZE == 1)
+ add %r20,%r19,%r28 C add first limbs ignoring cy
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L(loop)
+ addc %r20,%r19,%r28
+
+ .label L(end)
+ stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+EPILOGUE()
diff --git a/mpn/pa32/gmp-mparam.h b/mpn/pa32/gmp-mparam.h
new file mode 100644
index 000000000..2fec625c3
--- /dev/null
+++ b/mpn/pa32/gmp-mparam.h
@@ -0,0 +1,56 @@
+/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* These values are for the PA7100 using GCC. */
+/* Generated by tuneup.c, 2000-10-27. */
+
+#ifndef MUL_KARATSUBA_THRESHOLD
+#define MUL_KARATSUBA_THRESHOLD 30
+#endif
+#ifndef MUL_TOOM3_THRESHOLD
+#define MUL_TOOM3_THRESHOLD 141
+#endif
+
+#ifndef SQR_KARATSUBA_THRESHOLD
+#define SQR_KARATSUBA_THRESHOLD 59
+#endif
+#ifndef SQR_TOOM3_THRESHOLD
+#define SQR_TOOM3_THRESHOLD 177
+#endif
+
+#ifndef DIV_DC_THRESHOLD
+#define DIV_DC_THRESHOLD 108
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 18
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/mpn/pa32/hppa1_1/addmul_1.asm b/mpn/pa32/hppa1_1/addmul_1.asm
new file mode 100644
index 000000000..40fefe505
--- /dev/null
+++ b/mpn/pa32/hppa1_1/addmul_1.asm
@@ -0,0 +1,98 @@
+dnl HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C size r24
+C s2_limb r23
+
+C This runs at 11 cycles/limb on a PA7000. With the used instructions, it can
+C not become faster due to data cache contention after a store. On the PA7100
+C it runs at 10 cycles/limb.
+
+C There are some ideas described in mul_1.asm that applies to this code too.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C .callinfo frame=64,no_calls
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) C move s2_limb ...
+ addib,= -1,%r24,L(just_one_limb)
+ fldws -16(%r30),%fr4 C ... into fr4
+ add %r0,%r0,%r0 C clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 C least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L(end)
+ ldw -12(%r30),%r1
+
+C Main loop
+ .label L(loop)
+ ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L(loop)
+ ldw -12(%r30),%r1
+
+ .label L(end)
+ ldw 0(%r26),%r29
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .label L(just_one_limb)
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ add %r29,%r1,%r19
+ stw %r19,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/gmp-mparam.h b/mpn/pa32/hppa1_1/gmp-mparam.h
new file mode 100644
index 000000000..f14ac6643
--- /dev/null
+++ b/mpn/pa32/hppa1_1/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2002-03-07, gcc 2.8 (pa7100/100MHz) */
+
+#define MUL_KARATSUBA_THRESHOLD 30
+#define MUL_TOOM3_THRESHOLD 141
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_KARATSUBA_THRESHOLD 55
+#define SQR_TOOM3_THRESHOLD 185
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 95
+#define POWM_THRESHOLD 150
+
+#define GCD_ACCEL_THRESHOLD 3
+#define GCDEXT_THRESHOLD 0 /* always */
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD 3
+#define DIVREM_1_UNNORM_THRESHOLD 6
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 6
+#define USE_PREINV_DIVREM_1 1
+#define USE_PREINV_MOD_1 1
+#define DIVREM_2_THRESHOLD 0 /* always */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_THRESHOLD 6589
+
+#define MUL_FFT_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 608
+#define MUL_FFT_THRESHOLD 5888
+
+#define SQR_FFT_TABLE { 624, 1504, 2688, 6656, 18432, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 640
+#define SQR_FFT_THRESHOLD 5376
diff --git a/mpn/pa32/hppa1_1/mul_1.asm b/mpn/pa32/hppa1_1/mul_1.asm
new file mode 100644
index 000000000..bc336c55e
--- /dev/null
+++ b/mpn/pa32/hppa1_1/mul_1.asm
@@ -0,0 +1,94 @@
+dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl result in a second limb vector.
+
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C size r24
+C s2_limb r23
+
+C This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
+C not become faster due to data cache contention after a store. On the PA7100
+C it runs at 7 cycles/limb.
+
+C We could use fldds to read two limbs at a time from the S1 array, and that
+C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+C PA7100, respectively. We don't do that since it does not seem worth the
+C (alignment) troubles...
+
+C At least the PA7100 is rumored to be able to deal with cache-misses without
+C stalling instruction issue. If this is true, and the cache is actually also
+C lockup-free, we should use a deeper software pipeline, and load from S1 very
+C early! (The loads and stores to -12(sp) will surely be in the cache.)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+C .callinfo frame=64,no_calls
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) C move s2_limb ...
+ addib,= -1,%r24,L(just_one_limb)
+ fldws -16(%r30),%fr4 C ... into fr4
+ add %r0,%r0,%r0 C clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 C least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L(end)
+ ldw -12(%r30),%r1
+
+C Main loop
+ .label L(loop)
+ fldws,ma 4(%r25),%fr5
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addib,<> -1,%r24,L(loop)
+ ldw -12(%r30),%r1
+
+ .label L(end)
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .label L(just_one_limb)
+ xmpyu %fr4,%fr5,%fr6
+ fstds %fr6,-16(%r30)
+ ldw -16(%r30),%r28
+ ldo -64(%r30),%r30
+ bv 0(%r2)
+ fstws %fr6R,0(%r26)
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/pa7100/add_n.asm b/mpn/pa32/hppa1_1/pa7100/add_n.asm
new file mode 100644
index 000000000..e515456b6
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/add_n.asm
@@ -0,0 +1,74 @@
+dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector. Optimized for the PA7100, where is runs at
+dnl 4.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C s2_ptr r24
+C size r23
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L(rest)
+ add %r20,%r19,%r28 C add first limbs ignoring cy
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L(loop)
+ addc %r20,%r19,%r28
+
+ .label L(rest)
+ addib,= 4,%r23,L(end)
+ nop
+
+ .label L(eloop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L(eloop)
+ addc %r20,%r19,%r28
+
+ .label L(end)
+ stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
new file mode 100644
index 000000000..58290c54b
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
@@ -0,0 +1,192 @@
+dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23') C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C .callinfo frame=128,no_calls
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb C clear cy and cylimb
+ addib,< -4,size,L(few_limbs)
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L(0)
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size,L(few_limbs)
+ stws,ma s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+ .label L(0)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L(end)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+ .label L(loop)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L(loop)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+ .label L(end)
+ ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+C restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+ .label L(few_limbs)
+ addib,=,n 4,size,L(ret)
+
+ .label L(loop2)
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L(loop2)
+ nop
+
+ .label L(ret)
+ addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+EPILOGUE(mpn_addmul_1)
diff --git a/mpn/pa32/hppa1_1/pa7100/lshift.asm b/mpn/pa32/hppa1_1/pa7100/lshift.asm
new file mode 100644
index 000000000..336b0daaa
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/lshift.asm
@@ -0,0 +1,86 @@
+dnl HP-PA mpn_lshift -- Shift a number left.
+dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L(0004)
+ vshd %r0,%r22,%r28 C compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r22,%r29,%r20
+
+ .label L(loop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r22,%r29,%r20
+
+ .label L(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+ .label L(eloop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r22,%r29,%r20
+
+ .label L(end1)
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .label L(end2)
+ stws,mb %r20,-4(0,%r26)
+
+ .label L(0004)
+ vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/pa7100/rshift.asm b/mpn/pa32/hppa1_1/pa7100/rshift.asm
new file mode 100644
index 000000000..17ac0eecb
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/rshift.asm
@@ -0,0 +1,83 @@
+dnl HP-PA mpn_rshift -- Shift a number right.
+dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L(0004)
+ vshd %r22,%r0,%r28 C compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r29,%r22,%r20
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r29,%r22,%r20
+
+ .label L(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+ .label L(eloop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r29,%r22,%r20
+
+ .label L(end1)
+ stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .label L(end2)
+ stws,ma %r20,4(0,%r26)
+
+ .label L(0004)
+ vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/mpn/pa32/hppa1_1/pa7100/sub_n.asm
new file mode 100644
index 000000000..14073cf19
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/sub_n.asm
@@ -0,0 +1,75 @@
+dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector. Optimized for the PA7100, where
+dnl is runs at 4.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C s2_ptr r24
+C size r23
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L(rest)
+ sub %r20,%r19,%r28 C subtract first limbs ignoring cy
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L(loop)
+ subb %r20,%r19,%r28
+
+ .label L(rest)
+ addib,= 4,%r23,L(end)
+ nop
+
+ .label L(eloop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L(eloop)
+ subb %r20,%r19,%r28
+
+ .label L(end)
+ stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/mpn/pa32/hppa1_1/pa7100/submul_1.asm
new file mode 100644
index 000000000..f99079251
--- /dev/null
+++ b/mpn/pa32/hppa1_1/pa7100/submul_1.asm
@@ -0,0 +1,198 @@
+dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23') C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C .callinfo frame=128,no_calls
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb C clear cy and cylimb
+ addib,< -4,size,L(few_limbs)
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L(0)
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 C invert cy
+ addib,< -1,size,L(few_limbs)
+ stws,ma s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+ .label L(0)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L(end)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+ .label L(loop)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 C these two insns ...
+ add lo0,lo0,%r0 C ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L(loop)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+ .label L(end)
+ ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 C these two insns ...
+ add lo0,lo0,%r0 C ... invert cy
+
+C restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+ .label L(few_limbs)
+ addib,=,n 4,size,L(ret)
+
+ .label L(loop2)
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 C invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L(loop2)
+ nop
+
+ .label L(ret)
+ addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+EPILOGUE(mpn_submul_1)
diff --git a/mpn/pa32/hppa1_1/sqr_diagonal.asm b/mpn/pa32/hppa1_1/sqr_diagonal.asm
new file mode 100644
index 000000000..53e4254b7
--- /dev/null
+++ b/mpn/pa32/hppa1_1/sqr_diagonal.asm
@@ -0,0 +1,51 @@
+dnl HP-PA 1.1 32-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C This code runs at 6 cycles/limb on the PA7100 and 2.5 cycles/limb on PA8x00.
+C 2-way unrolling wouldn't help the PA7100; it could however bring times down
+C to 2.0 cycles/limb for the PA8x00.
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ ldo 4(rp),rp
+ fldws,ma 4(up),%fr4r
+ addib,= -1,n,L(exit)
+ xmpyu %fr4r,%fr4r,%fr5
+
+ .label L(loop)
+ fldws,ma 4(up),%fr4r
+ fstws %fr5r,-4(rp)
+ fstws,ma %fr5l,8(rp)
+ addib,<> -1,n,L(loop)
+ xmpyu %fr4r,%fr4r,%fr5
+
+ .label L(exit)
+ fstws %fr5r,-4(rp)
+ bv 0(%r2)
+ fstws %fr5l,0(rp)
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/mpn/pa32/hppa1_1/submul_1.asm b/mpn/pa32/hppa1_1/submul_1.asm
new file mode 100644
index 000000000..3a6c756cc
--- /dev/null
+++ b/mpn/pa32/hppa1_1/submul_1.asm
@@ -0,0 +1,107 @@
+dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C size r24
+C s2_limb r23
+
+C This runs at 12 cycles/limb on a PA7000. With the used instructions, it can
+C not become faster due to data cache contention after a store. On the PA7100
+C it runs at 11 cycles/limb.
+
+C There are some ideas described in mul_1.asm that applies to this code too.
+
+C It seems possible to make this run as fast as mpn_addmul_1, if we use
+C sub,>>= %r29,%r19,%r22
+C addi 1,%r28,%r28
+C but that requires reworking the hairy software pipeline...
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C .callinfo frame=64,no_calls
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) C move s2_limb ...
+ addib,= -1,%r24,L(just_one_limb)
+ fldws -16(%r30),%fr4 C ... into fr4
+ add %r0,%r0,%r0 C clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 C least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L(end)
+ ldw -12(%r30),%r1
+
+C Main loop
+ .label L(loop)
+ ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L(loop)
+ ldw -12(%r30),%r1
+
+ .label L(end)
+ ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .label L(just_one_limb)
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+EPILOGUE()
diff --git a/mpn/pa32/hppa1_1/udiv_qrnnd.asm b/mpn/pa32/hppa1_1/udiv_qrnnd.asm
new file mode 100644
index 000000000..0fb302e53
--- /dev/null
+++ b/mpn/pa32/hppa1_1/udiv_qrnnd.asm
@@ -0,0 +1,96 @@
+dnl HP-PA __udiv_qrnnd division support, used from longlong.h.
+dnl This version runs fast on PA 7000 and later.
+
+dnl Copyright 1993, 1994, 2000, 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr gr26
+C n1 gr25
+C n0 gr24
+C d gr23
+
+C This file has caused a lot of trouble, since it demands PIC reference to
+C static data, which triggers bugs in gas (at least version 2.7 through
+C 2.11.2). When the bug is triggered, many bogus relocs are generated. The
+C current solution is to stuff data right into the code, and refer it using
+C absolute offsets. Fragile to be sure, but nothing else seems to work.
+
+ASM_START()
+ifdef(`PIC',`',
+` RODATA
+ INT64(L(0000), 0x43f00000, 0x0) C 2^64
+')
+
+PROLOGUE(mpn_udiv_qrnnd)
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+
+ stws %r25,-16(0,%r30) C n_hi
+ stws %r24,-12(0,%r30) C n_lo
+
+ifdef(`PIC',
+` bl .+20,%r31
+ dep %r0,31,2,%r31
+ .word 0x0 C padding for alignment
+ .word 0x43f00000, 0x0 C 2^64
+ ldo 4(%r31),%r31',
+` ldil `L'%L(0000),%r31
+ ldo R%L(0000)(%r31),%r31')
+
+ fldds -16(0,%r30),%fr5
+ stws %r23,-12(0,%r30)
+ comib,<= 0,%r25,L(1)
+ fcnvxf,dbl,dbl %fr5,%fr5
+ fldds 0(0,%r31),%fr4
+ fadd,dbl %fr4,%fr5,%fr5
+
+ .label L(1)
+ fcpy,sgl %fr0,%fr6L
+ fldws -12(0,%r30),%fr6R
+ fcnvxf,dbl,dbl %fr6,%fr4
+
+ fdiv,dbl %fr5,%fr4,%fr5
+
+ fcnvfx,dbl,dbl %fr5,%fr4
+ fstws %fr4R,-16(%r30)
+ xmpyu %fr4R,%fr6R,%fr6
+ ldws -16(%r30),%r28
+ fstds %fr6,-16(0,%r30)
+ ldws -12(0,%r30),%r21
+ ldws -16(0,%r30),%r20
+ sub %r24,%r21,%r22
+ subb %r25,%r20,%r20
+ comib,= 0,%r20,L(2)
+ ldo -64(%r30),%r30
+
+ add %r22,%r23,%r22
+ ldo -1(%r28),%r28
+
+ .label L(2)
+ bv 0(%r2)
+ stws %r22,0(0,%r26)
+ .exit
+ .procend
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/mpn/pa32/hppa1_1/umul.asm b/mpn/pa32/hppa1_1/umul.asm
new file mode 100644
index 000000000..d8bc93317
--- /dev/null
+++ b/mpn/pa32/hppa1_1/umul.asm
@@ -0,0 +1,38 @@
+dnl Copyright 1999, 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+C .callinfo frame=64,no_calls
+
+ ldo 64(%r30),%r30
+ stw %r25,-16(0,%r30)
+ fldws -16(0,%r30),%fr22R
+ stw %r24,-16(0,%r30)
+ fldws -16(0,%r30),%fr22L
+ xmpyu %fr22R,%fr22L,%fr22
+ fstds %fr22,-16(0,%r30)
+ ldw -16(0,%r30),%r28
+ ldw -12(0,%r30),%r29
+ stw %r29,0(0,%r26)
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+EPILOGUE()
diff --git a/mpn/pa32/hppa2_0/add_n.asm b/mpn/pa32/hppa2_0/add_n.asm
new file mode 100644
index 000000000..fc0f14356
--- /dev/null
+++ b/mpn/pa32/hppa2_0/add_n.asm
@@ -0,0 +1,98 @@
+dnl HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0
+dnl and store sum in a third limb vector.
+
+dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s1_ptr gr25
+C s2_ptr gr24
+C size gr23
+
+C This runs at 2 cycles/limb on PA8000.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 C offset s1_ptr
+ sub %r24,%r22,%r24 C offset s2_ptr
+ sub %r26,%r22,%r26 C offset res_ptr
+ blr %r28,%r0 C branch into loop
+ add %r0,%r0,%r0 C reset carry
+
+ .label L(loop)
+ ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,0(%r26)
+
+ .label L(7)
+ ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,4(%r26)
+
+ .label L(6)
+ ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,8(%r26)
+
+ .label L(5)
+ ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,12(%r26)
+
+ .label L(4)
+ ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,16(%r26)
+
+ .label L(3)
+ ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,20(%r26)
+
+ .label L(2)
+ ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,24(%r26)
+
+ .label L(1)
+ ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L(loop)
+ ldo 32(%r26),%r26
+
+ bv (%r2)
+ addc %r0,%r0,%r28
+EPILOGUE()
diff --git a/mpn/pa32/hppa2_0/gmp-mparam.h b/mpn/pa32/hppa2_0/gmp-mparam.h
new file mode 100644
index 000000000..f2306773c
--- /dev/null
+++ b/mpn/pa32/hppa2_0/gmp-mparam.h
@@ -0,0 +1,57 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+
+/* Generated by tuneup.c, 2001-02-18, gcc 2.95 */
+
+#define MUL_KARATSUBA_THRESHOLD 16
+#define MUL_TOOM3_THRESHOLD 129
+
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_KARATSUBA_THRESHOLD 48
+#define SQR_TOOM3_THRESHOLD 153
+
+#define DIV_SB_PREINV_THRESHOLD 6
+#define DIV_DC_THRESHOLD 102
+#define POWM_THRESHOLD 166
+
+#define GCD_ACCEL_THRESHOLD 4
+#define GCDEXT_THRESHOLD 0
+
+#define DIVREM_1_NORM_THRESHOLD 4
+#define DIVREM_1_UNNORM_THRESHOLD 6
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 6
+#define USE_PREINV_MOD_1 0
+#define DIVREM_2_THRESHOLD 0
+#define DIVEXACT_1_THRESHOLD 0
+#define MODEXACT_1_ODD_THRESHOLD 0
+
+#define MUL_FFT_TABLE { 656, 928, 1920, 3584, 14336, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD 584
+#define MUL_FFT_THRESHOLD 3840
+
+#define SQR_FFT_TABLE { 656, 928, 1920, 3584, 14336, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD 616
+#define SQR_FFT_THRESHOLD 3840
diff --git a/mpn/pa32/hppa2_0/sqr_diagonal.asm b/mpn/pa32/hppa2_0/sqr_diagonal.asm
new file mode 100644
index 000000000..463aa45f4
--- /dev/null
+++ b/mpn/pa32/hppa2_0/sqr_diagonal.asm
@@ -0,0 +1,103 @@
+dnl HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C This code runs at 6 cycles/limb on the PA7100 and 2 cycles/limb on PA8x00.
+C The 2-way unrolling is actually not helping the PA7100.
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+
+ fldws,ma 4(up),%fr4r
+ addib,= -1,n,L(end1)
+ ldo 4(rp),rp
+
+ fldws,ma 4(up),%fr6r
+ addib,= -1,n,L(end2)
+ xmpyu %fr4r,%fr4r,%fr5
+
+ fldws,ma 4(up),%fr4r
+ addib,= -1,n,L(end3)
+ xmpyu %fr6r,%fr6r,%fr7
+
+
+ .label L(loop)
+ fldws,ma 4(up),%fr6r
+ fstws %fr5r,-4(rp)
+ fstws,ma %fr5l,8(rp)
+ addib,= -1,n,L(exite)
+ xmpyu %fr4r,%fr4r,%fr5
+ fldws,ma 4(up),%fr4r
+ fstws %fr7r,-4(rp)
+ fstws,ma %fr7l,8(rp)
+ addib,<> -1,n,L(loop)
+ xmpyu %fr6r,%fr6r,%fr7
+
+ .label L(exito)
+ fstws %fr5r,-4(rp)
+ fstws %fr5l,0(rp)
+ xmpyu %fr4r,%fr4r,%fr5
+ fstws %fr7r,4(rp)
+ fstws %fr7l,8(rp)
+ fstws,mb %fr5r,12(rp)
+ bv 0(%r2)
+ fstws %fr5l,4(rp)
+
+ .label L(exite)
+ fstws %fr7r,-4(rp)
+ fstws %fr7l,0(rp)
+ xmpyu %fr6r,%fr6r,%fr7
+ fstws %fr5r,4(rp)
+ fstws %fr5l,8(rp)
+ fstws,mb %fr7r,12(rp)
+ bv 0(%r2)
+ fstws %fr7l,4(rp)
+
+ .label L(end1)
+ xmpyu %fr4r,%fr4r,%fr5
+ fstws %fr5r,-4(rp)
+ bv 0(%r2)
+ fstws,ma %fr5l,8(rp)
+
+ .label L(end2)
+ xmpyu %fr6r,%fr6r,%fr7
+ fstws %fr5r,-4(rp)
+ fstws %fr5l,0(rp)
+ fstws %fr7r,4(rp)
+ bv 0(%r2)
+ fstws %fr7l,8(rp)
+
+ .label L(end3)
+ fstws %fr5r,-4(rp)
+ fstws %fr5l,0(rp)
+ xmpyu %fr4r,%fr4r,%fr5
+ fstws %fr7r,4(rp)
+ fstws %fr7l,8(rp)
+ fstws,mb %fr5r,12(rp)
+ bv 0(%r2)
+ fstws %fr5l,4(rp)
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/mpn/pa32/hppa2_0/sub_n.asm b/mpn/pa32/hppa2_0/sub_n.asm
new file mode 100644
index 000000000..d6b930dee
--- /dev/null
+++ b/mpn/pa32/hppa2_0/sub_n.asm
@@ -0,0 +1,98 @@
+dnl HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same
+dnl length > 0 and store difference in a third limb vector.
+
+dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s1_ptr gr25
+C s2_ptr gr24
+C size gr23
+
+C This runs at 2 cycles/limb on PA8000.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 C offset s1_ptr
+ sub %r24,%r22,%r24 C offset s2_ptr
+ blr %r28,%r0 C branch into loop
+ sub %r26,%r22,%r26 C offset res_ptr and set carry
+
+ .label L(loop)
+ ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,0(%r26)
+
+ .label L(7)
+ ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,4(%r26)
+
+ .label L(6)
+ ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,8(%r26)
+
+ .label L(5)
+ ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,12(%r26)
+
+ .label L(4)
+ ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,16(%r26)
+
+ .label L(3)
+ ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,20(%r26)
+
+ .label L(2)
+ ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,24(%r26)
+
+ .label L(1)
+ ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L(loop)
+ ldo 32(%r26),%r26
+
+ addc %r0,%r0,%r28
+ bv (%r2)
+ subi 1,%r28,%r28
+EPILOGUE()
diff --git a/mpn/pa32/lshift.asm b/mpn/pa32/lshift.asm
new file mode 100644
index 000000000..393f4dc0c
--- /dev/null
+++ b/mpn/pa32/lshift.asm
@@ -0,0 +1,66 @@
+dnl HP-PA mpn_lshift -- Shift a number left.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s_ptr gr25
+C size gr24
+C cnt gr23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L(0004)
+ vshd %r0,%r22,%r28 C compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,= -1,%r24,L(0002)
+ vshd %r22,%r29,%r20
+
+ .label L(loop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,= -1,%r24,L(0003)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,<> -1,%r24,L(loop)
+ vshd %r22,%r29,%r20
+
+ .label L(0002)
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .label L(0003)
+ stws,mb %r20,-4(0,%r26)
+
+ .label L(0004)
+ vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+EPILOGUE()
diff --git a/mpn/pa32/rshift.asm b/mpn/pa32/rshift.asm
new file mode 100644
index 000000000..f4efdab42
--- /dev/null
+++ b/mpn/pa32/rshift.asm
@@ -0,0 +1,63 @@
+dnl HP-PA mpn_rshift -- Shift a number right.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s_ptr gr25
+C size gr24
+C cnt gr23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L(0004)
+ vshd %r22,%r0,%r28 C compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,= -1,%r24,L(0002)
+ vshd %r29,%r22,%r20
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,= -1,%r24,L(0003)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,<> -1,%r24,L(loop)
+ vshd %r29,%r22,%r20
+
+ .label L(0002)
+ stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .label L(0003)
+ stws,ma %r20,4(0,%r26)
+
+ .label L(0004)
+ vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+EPILOGUE()
diff --git a/mpn/pa32/sub_n.asm b/mpn/pa32/sub_n.asm
new file mode 100644
index 000000000..33a15e468
--- /dev/null
+++ b/mpn/pa32/sub_n.asm
@@ -0,0 +1,55 @@
+dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26
+C s1_ptr gr25
+C s2_ptr gr24
+C size gr23
+
+C One might want to unroll this as for other processors, but it turns out that
+C the data cache contention after a store makes such unrolling useless. We
+C can't come under 5 cycles/limb anyway.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L(end) C check for (SIZE == 1)
+ sub %r20,%r19,%r28 C subtract first limbs ignoring cy
+
+ .label L(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L(loop)
+ subb %r20,%r19,%r28
+
+ .label L(end)
+ stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+EPILOGUE()
diff --git a/mpn/pa32/udiv_qrnnd.asm b/mpn/pa32/udiv_qrnnd.asm
new file mode 100644
index 000000000..ff4c8a47e
--- /dev/null
+++ b/mpn/pa32/udiv_qrnnd.asm
@@ -0,0 +1,282 @@
+dnl HP-PA __udiv_qrnnd division support, used from longlong.h.
+dnl This version runs fast on pre-PA7000 CPUs.
+
+dnl Copyright 1993, 1994, 2000, 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr gr26
+C n1 gr25
+C n0 gr24
+C d gr23
+
+C The code size is a bit excessive. We could merge the last two ds;addc
+C sequences by simply moving the "bb,< Odd" instruction down. The only
+C trouble is the FFFFFFFF code that would need some hacking.
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+ comb,< %r23,0,L(largedivisor)
+ sub %r0,%r23,%r1 C clear cy as side-effect
+ ds %r0,%r1,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r28
+ ds %r25,%r23,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r23,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r28,%r28,%r28
+
+ .label L(largedivisor)
+ extru %r24,31,1,%r19 C r19 = n0 & 1
+ bb,< %r23,31,L(odd)
+ extru %r23,30,31,%r22 C r22 = d >> 1
+ shd %r25,%r24,1,%r24 C r24 = new n0
+ extru %r25,30,31,%r25 C r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r24,%r24,%r28
+
+ .label L(odd)
+ addib,sv,n 1,%r22,L(FFFFFFFF) C r22 = (d / 2 + 1)
+ shd %r25,%r24,1,%r24 C r24 = new n0
+ extru %r25,30,31,%r25 C r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r28
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+C We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+ add,nuv %r28,%r25,%r25
+ addl %r25,%r1,%r25
+ addc %r0,%r28,%r28
+ sub,<< %r25,%r23,%r0
+ addl %r25,%r1,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r28,%r28
+
+C This is just a special case of the code above.
+C We come here when d == 0xFFFFFFFF
+ .label L(FFFFFFFF)
+ add,uv %r25,%r24,%r24
+ sub,<< %r24,%r23,%r0
+ ldo 1(%r24),%r24
+ stws %r24,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r25,%r28
+EPILOGUE()