diff options
author | tege <tege@gmplib.org> | 2002-03-26 17:30:50 +0100 |
---|---|---|
committer | tege <tege@gmplib.org> | 2002-03-26 17:30:50 +0100 |
commit | 59ccffe8a22a0c991f9c44a8c17db500f9b08a51 (patch) | |
tree | f2f02ca330e0de3bd9b186f54071a1c8f1b195a5 /mpn/pa32 | |
parent | 337d7796e7aadc1055f2e2e4c89f5849f6a1f994 (diff) | |
download | gmp-59ccffe8a22a0c991f9c44a8c17db500f9b08a51.tar.gz |
Moved from mpn/hppa.
Diffstat (limited to 'mpn/pa32')
-rw-r--r-- | mpn/pa32/README | 143 | ||||
-rw-r--r-- | mpn/pa32/add_n.asm | 54 | ||||
-rw-r--r-- | mpn/pa32/gmp-mparam.h | 56 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/addmul_1.asm | 98 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/gmp-mparam.h | 63 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/mul_1.asm | 94 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/add_n.asm | 74 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/addmul_1.asm | 192 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/lshift.asm | 86 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/rshift.asm | 83 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/sub_n.asm | 75 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/pa7100/submul_1.asm | 198 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/sqr_diagonal.asm | 51 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/submul_1.asm | 107 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/udiv_qrnnd.asm | 96 | ||||
-rw-r--r-- | mpn/pa32/hppa1_1/umul.asm | 38 | ||||
-rw-r--r-- | mpn/pa32/hppa2_0/add_n.asm | 98 | ||||
-rw-r--r-- | mpn/pa32/hppa2_0/gmp-mparam.h | 57 | ||||
-rw-r--r-- | mpn/pa32/hppa2_0/sqr_diagonal.asm | 103 | ||||
-rw-r--r-- | mpn/pa32/hppa2_0/sub_n.asm | 98 | ||||
-rw-r--r-- | mpn/pa32/lshift.asm | 66 | ||||
-rw-r--r-- | mpn/pa32/rshift.asm | 63 | ||||
-rw-r--r-- | mpn/pa32/sub_n.asm | 55 | ||||
-rw-r--r-- | mpn/pa32/udiv_qrnnd.asm | 282 |
24 files changed, 2330 insertions, 0 deletions
diff --git a/mpn/pa32/README b/mpn/pa32/README new file mode 100644 index 000000000..b7c26318d --- /dev/null +++ b/mpn/pa32/README @@ -0,0 +1,143 @@ +Copyright 1996, 1999, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + + + + + + +This directory contains mpn functions for various HP PA-RISC chips. Code +that runs faster on the PA7100 and later implementations, is in the pa7100 +directory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On the PA7000 no memory instructions can issue the two cycles after a store. +For the PA7100, this is reduced to one cycle. + +The PA7100 has a lookup-free cache, so it helps to schedule loads and the +dependent instruction really far from each other. + +STATUS + +1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the + instructions below (but some sw pipelining is needed to avoid the + xmpyu-fstds delay): + + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + + addc + stws res_ptr + addc + stws res_ptr + + addib Loop + +2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb + (asymptotically) on the PA7100, using the instructions below. With proper + sw pipelining and the unrolling level below, the speed becomes 8 + cycles/limb. + + fldds s1_ptr + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + addc + addc + addc + addc + addc %r0,%r0,cy-limb + + ldws res_ptr + ldws res_ptr + ldws res_ptr + ldws res_ptr + add + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + + addib + +3. For the PA8000 we have to stick to using 32-bit limbs before compiler + support emerges. But we want to use 64-bit operations whenever possible, + in particular for loads and stores. It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), masking+bit field + inserting when (they are not). The speed should double compared to the + code used today. + + + + +LABEL SYNTAX + +The HP-UX assembler takes labels starting in column 0 with no colon, + + L$loop ldws,mb -4(0,%r25),%r22 + +Gas on hppa GNU/Linux however requires a colon, + + L$loop: ldws,mb -4(0,%r25),%r22 + +Fortunately both accept a ".label" pseudo-op, + + .label L$loop + ldws,mb -4(0,%r25),%r22 + + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/mpn/pa32/add_n.asm b/mpn/pa32/add_n.asm new file mode 100644 index 000000000..9b201c445 --- /dev/null +++ b/mpn/pa32/add_n.asm @@ -0,0 +1,54 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C One might want to unroll this as for other processors, but it turns out that +C the data cache contention after a store makes such unrolling useless. We +C can't come under 5 cycles/limb anyway. + +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L(end) C check for (SIZE == 1) + add %r20,%r19,%r28 C add first limbs ignoring cy + + .label L(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L(loop) + addc %r20,%r19,%r28 + + .label L(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/mpn/pa32/gmp-mparam.h b/mpn/pa32/gmp-mparam.h new file mode 100644 index 000000000..2fec625c3 --- /dev/null +++ b/mpn/pa32/gmp-mparam.h @@ -0,0 +1,56 @@ +/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-10-27. */ + +#ifndef MUL_KARATSUBA_THRESHOLD +#define MUL_KARATSUBA_THRESHOLD 30 +#endif +#ifndef MUL_TOOM3_THRESHOLD +#define MUL_TOOM3_THRESHOLD 141 +#endif + +#ifndef SQR_KARATSUBA_THRESHOLD +#define SQR_KARATSUBA_THRESHOLD 59 +#endif +#ifndef SQR_TOOM3_THRESHOLD +#define SQR_TOOM3_THRESHOLD 177 +#endif + +#ifndef DIV_DC_THRESHOLD +#define DIV_DC_THRESHOLD 108 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/mpn/pa32/hppa1_1/addmul_1.asm b/mpn/pa32/hppa1_1/addmul_1.asm new file mode 100644 index 000000000..40fefe505 --- /dev/null +++ b/mpn/pa32/hppa1_1/addmul_1.asm @@ -0,0 +1,98 @@ +dnl HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 11 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 10 cycles/limb. + +C There are some ideas described in mul_1.asm that applies to this code too. + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop + .label L(loop) + ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + + .label L(end) + ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .label L(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/gmp-mparam.h b/mpn/pa32/hppa1_1/gmp-mparam.h new file mode 100644 index 000000000..f14ac6643 --- /dev/null +++ b/mpn/pa32/hppa1_1/gmp-mparam.h @@ -0,0 +1,63 @@ +/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2002-03-07, gcc 2.8 (pa7100/100MHz) */ + +#define MUL_KARATSUBA_THRESHOLD 30 +#define MUL_TOOM3_THRESHOLD 141 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_KARATSUBA_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 185 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 95 +#define POWM_THRESHOLD 150 + +#define GCD_ACCEL_THRESHOLD 3 +#define GCDEXT_THRESHOLD 0 /* always */ +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 6 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 6 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_THRESHOLD 6589 + +#define MUL_FFT_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 608 +#define MUL_FFT_THRESHOLD 5888 + +#define SQR_FFT_TABLE { 624, 1504, 2688, 6656, 18432, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 640 +#define SQR_FFT_THRESHOLD 5376 diff --git a/mpn/pa32/hppa1_1/mul_1.asm b/mpn/pa32/hppa1_1/mul_1.asm new file mode 100644 index 000000000..bc336c55e --- /dev/null +++ b/mpn/pa32/hppa1_1/mul_1.asm @@ -0,0 +1,94 @@ +dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 7 cycles/limb. + +C We could use fldds to read two limbs at a time from the S1 array, and that +C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +C PA7100, respectively. We don't do that since it does not seem worth the +C (alignment) troubles... + +C At least the PA7100 is rumored to be able to deal with cache-misses without +C stalling instruction issue. If this is true, and the cache is actually also +C lockup-free, we should use a deeper software pipeline, and load from S1 very +C early! (The loads and stores to -12(sp) will surely be in the cache.) + +ASM_START() +PROLOGUE(mpn_mul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop + .label L(loop) + fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + + .label L(end) + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .label L(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/pa7100/add_n.asm b/mpn/pa32/hppa1_1/pa7100/add_n.asm new file mode 100644 index 000000000..e515456b6 --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/add_n.asm @@ -0,0 +1,74 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. Optimized for the PA7100, where is runs at +dnl 4.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L(rest) + add %r20,%r19,%r28 C add first limbs ignoring cy + + .label L(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) + addc %r20,%r19,%r28 + + .label L(rest) + addib,= 4,%r23,L(end) + nop + + .label L(eloop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + addc %r20,%r19,%r28 + + .label L(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/mpn/pa32/hppa1_1/pa7100/addmul_1.asm new file mode 100644 index 000000000..58290c54b --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/addmul_1.asm @@ -0,0 +1,192 @@ +dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size,L(few_limbs) + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size,L(few_limbs) + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- + .label L(0) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L(end) + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ + .label L(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L(loop) + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- + .label L(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +C restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + + .label L(few_limbs) + addib,=,n 4,size,L(ret) + + .label L(loop2) + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size,L(loop2) + nop + + .label L(ret) + addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_addmul_1) diff --git a/mpn/pa32/hppa1_1/pa7100/lshift.asm b/mpn/pa32/hppa1_1/pa7100/lshift.asm new file mode 100644 index 000000000..336b0daaa --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/lshift.asm @@ -0,0 +1,86 @@ +dnl HP-PA mpn_lshift -- Shift a number left. +dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s_ptr r25 +C size r24 +C cnt r23 + +ASM_START() +PROLOGUE(mpn_lshift) + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L(0004) + vshd %r0,%r22,%r28 C compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,<= -5,%r24,L(rest) + vshd %r22,%r29,%r20 + + .label L(loop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + vshd %r22,%r29,%r20 + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -4,%r24,L(loop) + vshd %r22,%r29,%r20 + + .label L(rest) + addib,= 4,%r24,L(end1) + nop + + .label L(eloop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,<= -1,%r24,L(end2) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -1,%r24,L(eloop) + vshd %r22,%r29,%r20 + + .label L(end1) + stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .label L(end2) + stws,mb %r20,-4(0,%r26) + + .label L(0004) + vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/pa7100/rshift.asm b/mpn/pa32/hppa1_1/pa7100/rshift.asm new file mode 100644 index 000000000..17ac0eecb --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/rshift.asm @@ -0,0 +1,83 @@ +dnl HP-PA mpn_rshift -- Shift a number right. +dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s_ptr r25 +C size r24 +C cnt r23 + +ASM_START() +PROLOGUE(mpn_rshift) + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L(0004) + vshd %r22,%r0,%r28 C compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,<= -5,%r24,L(rest) + vshd %r29,%r22,%r20 + + .label L(loop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + vshd %r29,%r22,%r20 + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -4,%r24,L(loop) + vshd %r29,%r22,%r20 + + .label L(rest) + addib,= 4,%r24,L(end1) + nop + + .label L(eloop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,<= -1,%r24,L(end2) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -1,%r24,L(eloop) + vshd %r29,%r22,%r20 + + .label L(end1) + stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .label L(end2) + stws,ma %r20,4(0,%r26) + + .label L(0004) + vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/mpn/pa32/hppa1_1/pa7100/sub_n.asm new file mode 100644 index 000000000..14073cf19 --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/sub_n.asm @@ -0,0 +1,75 @@ +dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. Optimized for the PA7100, where +dnl is runs at 4.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L(rest) + sub %r20,%r19,%r28 C subtract first limbs ignoring cy + + .label L(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) + subb %r20,%r19,%r28 + + .label L(rest) + addib,= 4,%r23,L(end) + nop + + .label L(eloop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + subb %r20,%r19,%r28 + + .label L(end) + stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/mpn/pa32/hppa1_1/pa7100/submul_1.asm new file mode 100644 index 000000000..f99079251 --- /dev/null +++ b/mpn/pa32/hppa1_1/pa7100/submul_1.asm @@ -0,0 +1,198 @@ +dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_submul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size,L(few_limbs) + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + addib,< -1,size,L(few_limbs) + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- + .label L(0) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L(end) + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ + .label L(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L(loop) + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- + .label L(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ... invert cy + +C restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + + .label L(few_limbs) + addib,=,n 4,size,L(ret) + + .label L(loop2) + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size,L(loop2) + nop + + .label L(ret) + addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_submul_1) diff --git a/mpn/pa32/hppa1_1/sqr_diagonal.asm b/mpn/pa32/hppa1_1/sqr_diagonal.asm new file mode 100644 index 000000000..53e4254b7 --- /dev/null +++ b/mpn/pa32/hppa1_1/sqr_diagonal.asm @@ -0,0 +1,51 @@ +dnl HP-PA 1.1 32-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2.5 cycles/limb on PA8x00. +C 2-way unrolling wouldn't help the PA7100; it could however bring times down +C to 2.0 cycles/limb for the PA8x00. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + ldo 4(rp),rp + fldws,ma 4(up),%fr4r + addib,= -1,n,L(exit) + xmpyu %fr4r,%fr4r,%fr5 + + .label L(loop) + fldws,ma 4(up),%fr4r + fstws %fr5r,-4(rp) + fstws,ma %fr5l,8(rp) + addib,<> -1,n,L(loop) + xmpyu %fr4r,%fr4r,%fr5 + + .label L(exit) + fstws %fr5r,-4(rp) + bv 0(%r2) + fstws %fr5l,0(rp) +EPILOGUE(mpn_sqr_diagonal) diff --git a/mpn/pa32/hppa1_1/submul_1.asm b/mpn/pa32/hppa1_1/submul_1.asm new file mode 100644 index 000000000..3a6c756cc --- /dev/null +++ b/mpn/pa32/hppa1_1/submul_1.asm @@ -0,0 +1,107 @@ +dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 12 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 11 cycles/limb. + +C There are some ideas described in mul_1.asm that applies to this code too. + +C It seems possible to make this run as fast as mpn_addmul_1, if we use +C sub,>>= %r29,%r19,%r22 +C addi 1,%r28,%r28 +C but that requires reworking the hairy software pipeline... + +ASM_START() +PROLOGUE(mpn_submul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop + .label L(loop) + ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + + .label L(end) + ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .label L(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/mpn/pa32/hppa1_1/udiv_qrnnd.asm b/mpn/pa32/hppa1_1/udiv_qrnnd.asm new file mode 100644 index 000000000..0fb302e53 --- /dev/null +++ b/mpn/pa32/hppa1_1/udiv_qrnnd.asm @@ -0,0 +1,96 @@ +dnl HP-PA __udiv_qrnnd division support, used from longlong.h. +dnl This version runs fast on PA 7000 and later. + +dnl Copyright 1993, 1994, 2000, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr gr26 +C n1 gr25 +C n0 gr24 +C d gr23 + +C This file has caused a lot of trouble, since it demands PIC reference to +C static data, which triggers bugs in gas (at least version 2.7 through +C 2.11.2). When the bug is triggered, many bogus relocs are generated. The +C current solution is to stuff data right into the code, and refer it using +C absolute offsets. Fragile to be sure, but nothing else seems to work. + +ASM_START() +ifdef(`PIC',`', +` RODATA + INT64(L(0000), 0x43f00000, 0x0) C 2^64 +') + +PROLOGUE(mpn_udiv_qrnnd) + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) C n_hi + stws %r24,-12(0,%r30) C n_lo + +ifdef(`PIC', +` bl .+20,%r31 + dep %r0,31,2,%r31 + .word 0x0 C padding for alignment + .word 0x43f00000, 0x0 C 2^64 + ldo 4(%r31),%r31', +` ldil `L'%L(0000),%r31 + ldo R%L(0000)(%r31),%r31') + + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L(1) + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r31),%fr4 + fadd,dbl %fr4,%fr5,%fr5 + + .label L(1) + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r20 + comib,= 0,%r20,L(2) + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 + + .label L(2) + bv 0(%r2) + stws %r22,0(0,%r26) + .exit + .procend +EPILOGUE(mpn_udiv_qrnnd) diff --git a/mpn/pa32/hppa1_1/umul.asm b/mpn/pa32/hppa1_1/umul.asm new file mode 100644 index 000000000..d8bc93317 --- /dev/null +++ b/mpn/pa32/hppa1_1/umul.asm @@ -0,0 +1,38 @@ +dnl Copyright 1999, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + stw %r25,-16(0,%r30) + fldws -16(0,%r30),%fr22R + stw %r24,-16(0,%r30) + fldws -16(0,%r30),%fr22L + xmpyu %fr22R,%fr22L,%fr22 + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + stw %r29,0(0,%r26) + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/mpn/pa32/hppa2_0/add_n.asm b/mpn/pa32/hppa2_0/add_n.asm new file mode 100644 index 000000000..fc0f14356 --- /dev/null +++ b/mpn/pa32/hppa2_0/add_n.asm @@ -0,0 +1,98 @@ +dnl HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0 +dnl and store sum in a third limb vector. + +dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000. + +ASM_START() +PROLOGUE(mpn_add_n) + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + sub %r26,%r22,%r26 C offset res_ptr + blr %r28,%r0 C branch into loop + add %r0,%r0,%r0 C reset carry + + .label L(loop) + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) + + .label L(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) + + .label L(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) + + .label L(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) + + .label L(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) + + .label L(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) + + .label L(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) + + .label L(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L(loop) + ldo 32(%r26),%r26 + + bv (%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/mpn/pa32/hppa2_0/gmp-mparam.h b/mpn/pa32/hppa2_0/gmp-mparam.h new file mode 100644 index 000000000..f2306773c --- /dev/null +++ b/mpn/pa32/hppa2_0/gmp-mparam.h @@ -0,0 +1,57 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999, 2000, 2001, 2002 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 + +/* Generated by tuneup.c, 2001-02-18, gcc 2.95 */ + +#define MUL_KARATSUBA_THRESHOLD 16 +#define MUL_TOOM3_THRESHOLD 129 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_KARATSUBA_THRESHOLD 48 +#define SQR_TOOM3_THRESHOLD 153 + +#define DIV_SB_PREINV_THRESHOLD 6 +#define DIV_DC_THRESHOLD 102 +#define POWM_THRESHOLD 166 + +#define GCD_ACCEL_THRESHOLD 4 +#define GCDEXT_THRESHOLD 0 + +#define DIVREM_1_NORM_THRESHOLD 4 +#define DIVREM_1_UNNORM_THRESHOLD 6 +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 6 +#define USE_PREINV_MOD_1 0 +#define DIVREM_2_THRESHOLD 0 +#define DIVEXACT_1_THRESHOLD 0 +#define MODEXACT_1_ODD_THRESHOLD 0 + +#define MUL_FFT_TABLE { 656, 928, 1920, 3584, 14336, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 584 +#define MUL_FFT_THRESHOLD 3840 + +#define SQR_FFT_TABLE { 656, 928, 1920, 3584, 14336, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 616 +#define SQR_FFT_THRESHOLD 3840 diff --git a/mpn/pa32/hppa2_0/sqr_diagonal.asm b/mpn/pa32/hppa2_0/sqr_diagonal.asm new file mode 100644 index 000000000..463aa45f4 --- /dev/null +++ b/mpn/pa32/hppa2_0/sqr_diagonal.asm @@ -0,0 +1,103 @@ +dnl HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2 cycles/limb on PA8x00. +C The 2-way unrolling is actually not helping the PA7100. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + + fldws,ma 4(up),%fr4r + addib,= -1,n,L(end1) + ldo 4(rp),rp + + fldws,ma 4(up),%fr6r + addib,= -1,n,L(end2) + xmpyu %fr4r,%fr4r,%fr5 + + fldws,ma 4(up),%fr4r + addib,= -1,n,L(end3) + xmpyu %fr6r,%fr6r,%fr7 + + + .label L(loop) + fldws,ma 4(up),%fr6r + fstws %fr5r,-4(rp) + fstws,ma %fr5l,8(rp) + addib,= -1,n,L(exite) + xmpyu %fr4r,%fr4r,%fr5 + fldws,ma 4(up),%fr4r + fstws %fr7r,-4(rp) + fstws,ma %fr7l,8(rp) + addib,<> -1,n,L(loop) + xmpyu %fr6r,%fr6r,%fr7 + + .label L(exito) + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) + bv 0(%r2) + fstws %fr5l,4(rp) + + .label L(exite) + fstws %fr7r,-4(rp) + fstws %fr7l,0(rp) + xmpyu %fr6r,%fr6r,%fr7 + fstws %fr5r,4(rp) + fstws %fr5l,8(rp) + fstws,mb %fr7r,12(rp) + bv 0(%r2) + fstws %fr7l,4(rp) + + .label L(end1) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr5r,-4(rp) + bv 0(%r2) + fstws,ma %fr5l,8(rp) + + .label L(end2) + xmpyu %fr6r,%fr6r,%fr7 + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + fstws %fr7r,4(rp) + bv 0(%r2) + fstws %fr7l,8(rp) + + .label L(end3) + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) + bv 0(%r2) + fstws %fr5l,4(rp) +EPILOGUE(mpn_sqr_diagonal) diff --git a/mpn/pa32/hppa2_0/sub_n.asm b/mpn/pa32/hppa2_0/sub_n.asm new file mode 100644 index 000000000..d6b930dee --- /dev/null +++ b/mpn/pa32/hppa2_0/sub_n.asm @@ -0,0 +1,98 @@ +dnl HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same +dnl length > 0 and store difference in a third limb vector. + +dnl Copyright 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000. + +ASM_START() +PROLOGUE(mpn_sub_n) + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + blr %r28,%r0 C branch into loop + sub %r26,%r22,%r26 C offset res_ptr and set carry + + .label L(loop) + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) + + .label L(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) + + .label L(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) + + .label L(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) + + .label L(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) + + .label L(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) + + .label L(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) + + .label L(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L(loop) + ldo 32(%r26),%r26 + + addc %r0,%r0,%r28 + bv (%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/mpn/pa32/lshift.asm b/mpn/pa32/lshift.asm new file mode 100644 index 000000000..393f4dc0c --- /dev/null +++ b/mpn/pa32/lshift.asm @@ -0,0 +1,66 @@ +dnl HP-PA mpn_lshift -- Shift a number left. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s_ptr gr25 +C size gr24 +C cnt gr23 + +ASM_START() +PROLOGUE(mpn_lshift) + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L(0004) + vshd %r0,%r22,%r28 C compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L(0002) + vshd %r22,%r29,%r20 + + .label L(loop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L(0003) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L(loop) + vshd %r22,%r29,%r20 + + .label L(0002) + stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .label L(0003) + stws,mb %r20,-4(0,%r26) + + .label L(0004) + vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +EPILOGUE() diff --git a/mpn/pa32/rshift.asm b/mpn/pa32/rshift.asm new file mode 100644 index 000000000..f4efdab42 --- /dev/null +++ b/mpn/pa32/rshift.asm @@ -0,0 +1,63 @@ +dnl HP-PA mpn_rshift -- Shift a number right. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s_ptr gr25 +C size gr24 +C cnt gr23 + +ASM_START() +PROLOGUE(mpn_rshift) + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L(0004) + vshd %r22,%r0,%r28 C compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L(0002) + vshd %r29,%r22,%r20 + + .label L(loop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L(0003) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L(loop) + vshd %r29,%r22,%r20 + + .label L(0002) + stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .label L(0003) + stws,ma %r20,4(0,%r26) + + .label L(0004) + vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +EPILOGUE() diff --git a/mpn/pa32/sub_n.asm b/mpn/pa32/sub_n.asm new file mode 100644 index 000000000..33a15e468 --- /dev/null +++ b/mpn/pa32/sub_n.asm @@ -0,0 +1,55 @@ +dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1992, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C One might want to unroll this as for other processors, but it turns out that +C the data cache contention after a store makes such unrolling useless. We +C can't come under 5 cycles/limb anyway. + +ASM_START() +PROLOGUE(mpn_sub_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L(end) C check for (SIZE == 1) + sub %r20,%r19,%r28 C subtract first limbs ignoring cy + + .label L(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L(loop) + subb %r20,%r19,%r28 + + .label L(end) + stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/mpn/pa32/udiv_qrnnd.asm b/mpn/pa32/udiv_qrnnd.asm new file mode 100644 index 000000000..ff4c8a47e --- /dev/null +++ b/mpn/pa32/udiv_qrnnd.asm @@ -0,0 +1,282 @@ +dnl HP-PA __udiv_qrnnd division support, used from longlong.h. +dnl This version runs fast on pre-PA7000 CPUs. + +dnl Copyright 1993, 1994, 2000, 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr gr26 +C n1 gr25 +C n0 gr24 +C d gr23 + +C The code size is a bit excessive. We could merge the last two ds;addc +C sequences by simply moving the "bb,< Odd" instruction down. The only +C trouble is the FFFFFFFF code that would need some hacking. + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + comb,< %r23,0,L(largedivisor) + sub %r0,%r23,%r1 C clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + + .label L(largedivisor) + extru %r24,31,1,%r19 C r19 = n0 & 1 + bb,< %r23,31,L(odd) + extru %r23,30,31,%r22 C r22 = d >> 1 + shd %r25,%r24,1,%r24 C r24 = new n0 + extru %r25,30,31,%r25 C r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + + .label L(odd) + addib,sv,n 1,%r22,L(FFFFFFFF) C r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 C r24 = new n0 + extru %r25,30,31,%r25 C r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +C We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +C This is just a special case of the code above. +C We come here when d == 0xFFFFFFFF + .label L(FFFFFFFF) + add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 +EPILOGUE() |