diff options
Diffstat (limited to 'rts/gmp/mpn/pa64')
-rw-r--r-- | rts/gmp/mpn/pa64/README | 38 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/add_n.s | 90 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/addmul_1.S | 167 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/gmp-mparam.h | 65 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/lshift.s | 103 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/mul_1.S | 158 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/rshift.s | 100 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/sub_n.s | 90 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/submul_1.S | 170 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/udiv_qrnnd.c | 111 | ||||
-rw-r--r-- | rts/gmp/mpn/pa64/umul_ppmm.S | 74 |
11 files changed, 1166 insertions, 0 deletions
diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README new file mode 100644 index 0000000000..8d2976dabc --- /dev/null +++ b/rts/gmp/mpn/pa64/README @@ -0,0 +1,38 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0. + +RELEVANT OPTIMIZATION ISSUES + +The PA8000 has a multi-issue pipeline with large buffers for instructions +awaiting pending results. Therefore, no latency scheduling is necessary +(and might actually be harmful). + +Two 64-bit loads can be completed per cycle. One 64-bit store can be +completed per cycle. A store cannot complete in the same cycle as a load. + +STATUS + +* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at + the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb + for add/subtract. + +* The multiplication functions run at 11 cycles/limb. The cache bandwidth + allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or + better scheduling, to get closer to the cache bandwidth limit. + +* xaddmul_1.S contains a quicker method for forming the 128 bit product. It + uses some fewer operations, and keeps the carry flag live across the loop + boundary. But it seems hard to make it run more than 1/4 cycle faster + than the old code. Perhaps we really ought to unroll this loop by 2x? + 2x should suffice since register latency scheduling is never needed, + but the unrolling would hide the store-load latency. Here is a sketch: + + 1. A multiply and store 64-bit products + 2. B sum 64-bit products 128-bit product + 3. B load 64-bit products to integer registers + 4. B multiply and store 64-bit products + 5. A sum 64-bit products 128-bit product + 6. A load 64-bit products to integer registers + 7. goto 1 + + In practice, adjacent groups (1 and 2, 2 and 3, etc) will be interleaved + for better instruction mix. 
diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s new file mode 100644 index 0000000000..22ff19c184 --- /dev/null +++ b/rts/gmp/mpn/pa64/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
+ + .level 2.0n + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S new file mode 100644 index 0000000000..b1885b432c --- /dev/null +++ b/rts/gmp/mpn/pa64/addmul_1.S @@ -0,0 +1,167 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. 
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + 
fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std 
t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000000..847735b987 --- /dev/null +++ b/rts/gmp/mpn/pa64/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured in a PA8000 using the system compiler version + A.10.32.30. Presumably the PA8200 and PA8500 have the same timing + characteristic, but GCC might give somewhat different results. */ +/* Generated by tuneup.c, 2000-07-25. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 40 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 72 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 94 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 50 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s new file mode 100644 index 0000000000..994bc1c4d6 --- /dev/null +++ b/rts/gmp/mpn/pa64/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0n + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S new file mode 100644 index 0000000000..ab310c1264 --- /dev/null +++ b/rts/gmp/mpn/pa64/mul_1.S @@ -0,0 +1,158 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi 
; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t2 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += 
lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s new file mode 100644 index 0000000000..f0730e2a91 --- /dev/null +++ b/rts/gmp/mpn/pa64/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0n + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s new file mode 100644 index 0000000000..dda1f54b34 --- /dev/null +++ b/rts/gmp/mpn/pa64/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std 
%r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S new file mode 100644 index 0000000000..27666b99df --- /dev/null +++ b/rts/gmp/mpn/pa64/submul_1.S @@ -0,0 +1,170 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc 
%r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c new file mode 100644 index 0000000000..1c9fe084db --- /dev/null +++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c @@ -0,0 +1,111 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800LL; + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater that our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. 
This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S new file mode 100644 index 0000000000..ceff2d752f --- /dev/null +++ b/rts/gmp/mpn/pa64/umul_ppmm.S @@ -0,0 +1,74 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0n + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + + ldw -180(%r30),%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + extrd,u p1,31,32,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend |