diff options
author | Simon Marlow <simonmar@microsoft.com> | 2006-04-07 02:05:11 +0000 |
---|---|---|
committer | Simon Marlow <simonmar@microsoft.com> | 2006-04-07 02:05:11 +0000 |
commit | 0065d5ab628975892cea1ec7303f968c3338cbe1 (patch) | |
tree | 8e2afe0ab48ee33cf95009809d67c9649573ef92 /rts/gmp/mpn/pa64w/addmul_1.S | |
parent | 28a464a75e14cece5db40f2765a29348273ff2d2 (diff) | |
download | haskell-0065d5ab628975892cea1ec7303f968c3338cbe1.tar.gz |
Reorganisation of the source tree
Most of the other users of the fptools build system have migrated to
Cabal, and with the move to darcs we can now flatten the source tree
without losing history, so here goes.
The main change is that the ghc/ subdir is gone, and most of what it
contained is now at the top level. The build system now makes no
pretense at being multi-project, it is just the GHC build system.
No doubt this will break many things, and there will be a period of
instability while we fix the dependencies. A straightforward build
should work, but I haven't yet fixed binary/source distributions.
Changes to the Building Guide will follow, too.
Diffstat (limited to 'rts/gmp/mpn/pa64w/addmul_1.S')
-rw-r--r-- | rts/gmp/mpn/pa64w/addmul_1.S | 168 |
1 files changed, 168 insertions, 0 deletions
diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S
new file mode 100644
index 0000000000..4799f90fc5
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/addmul_1.S
@@ -0,0 +1,168 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS: result/addend vector, source vector, limb count, multiplier
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+       .level      2.0w                ; PA-RISC 2.0, wide (64-bit) mode
+       .code
+       .export     __gmpn_addmul_1,entry
+__gmpn_addmul_1
+       .proc
+       .callinfo   frame=128,no_calls
+       .entry
+       std         s2limb,-56(%r30)    ; move s2limb to the FPU via memory
+       fldd        -56(%r30),%fr5      ; fr5 = s2limb, the multiplier
+       ldo         128(%r30),%r30      ; allocate the 128-byte frame
+       add         %r0,%r0,cylimb      ; clear cy and cylimb
+; Save %r3-%r6: used below as m1, t3, the 2^32 constant and t2
+       std         %r3,-96(%r30)
+       std         %r4,-88(%r30)
+       std         %r5,-80(%r30)
+       std         %r6,-72(%r30)
+       depdi,z     1,31,1,%r5          ; %r5 = 1 << 32, added to hi on carry
+; NOTE(review): assumes size >= 1 (limb 0 is loaded before size is tested)
+       fldd        0(sptr),%fr4        ; fr4 = first source limb
+       ldo         8(sptr),sptr        ; advance to the next source limb
+
+       xmpyu       %fr5R,%fr4R,%fr6    ; fr6 = lo32(s2limb) * lo32(limb)
+       fstd        %fr6,-128(%r30)     ; spill products for the integer unit
+       xmpyu       %fr5R,%fr4L,%fr7    ; fr7 = lo32(s2limb) * hi32(limb)
+       fstd        %fr7,-120(%r30)
+       xmpyu       %fr5L,%fr4R,%fr8    ; fr8 = hi32(s2limb) * lo32(limb)
+       fstd        %fr8,-112(%r30)
+       xmpyu       %fr5L,%fr4L,%fr9    ; fr9 = hi32(s2limb) * hi32(limb)
+       fstd        %fr9,-104(%r30)
+       ldd         -128(%r30),lo       ; lo = low 64 bit of product
+       ldd         -120(%r30),m0       ; m0 = mid0 64 bit of product
+       ldd         -112(%r30),m1       ; m1 = mid1 64 bit of product
+       ldd         -104(%r30),hi       ; hi = high 64 bit of product
+       addib,=     -1,%r24,L$end1      ; count -= 1; if 0, only one limb
+       nop                             ; branch delay slot
+       fldd        0(sptr),%fr4        ; fr4 = second source limb
+       ldo         8(sptr),sptr
+       addib,=     -1,%r24,L$end2      ; count -= 1; if 0, only two limbs
+       nop                             ; branch delay slot
+L$loop                                 ; multiply fr4 limb, sum previous limb
+       xmpyu       %fr5R,%fr4R,%fr6    ; partial products of the limb in fr4
+       fstd        %fr6,-128(%r30)
+       xmpyu       %fr5R,%fr4L,%fr7
+       fstd        %fr7,-120(%r30)
+       xmpyu       %fr5L,%fr4R,%fr8
+       fstd        %fr8,-112(%r30)
+       xmpyu       %fr5L,%fr4L,%fr9
+       fstd        %fr9,-104(%r30)
+       ldd         0(rptr),rlimb       ; rlimb = limb to be added to
+       extrd,u     lo,31,32,t1         ; t1 = hi32(lo)
+       extrd,u     lo,63,32,t4         ; t4 = lo32(lo)
+       add,l       m0,t1,t1            ; t1 += m0
+       add,l,*nuv  m1,t1,t1            ; t1 += m1; skip next if no overflow
+       add,l       %r5,hi,hi           ; propagate carry: hi += 2^32
+       extrd,u     t1,31,32,t2         ; t2 = hi32(t1)
+       depd,z      t1,31,32,t5         ; t5 = lo32(t1) << 32
+       add,l       t5,t4,t4            ; t4 += t5: low 64 bits of product
+       ldd         -128(%r30),lo       ; lo = low 64 bit of product
+       add         cylimb,rlimb,rlimb  ; rlimb += carry-in limb; sets carry
+       ldd         -120(%r30),m0       ; m0 = mid0 64 bit of product
+       add,dc      t2,hi,cylimb        ; cylimb = t2 + hi + carry
+       ldd         -112(%r30),m1       ; m1 = mid1 64 bit of product
+       add         t4,rlimb,t3         ; t3 = result limb; sets carry
+       ldd         -104(%r30),hi       ; hi = high 64 bit of product
+       add,dc      %r0,cylimb,cylimb   ; cylimb += carry
+       fldd        0(sptr),%fr4        ; fr4 = next source limb
+       ldo         8(sptr),sptr
+       std         t3,0(rptr)          ; store the result limb
+       addib,<>    -1,%r24,L$loop      ; count -= 1; loop while nonzero
+       ldo         8(rptr),rptr        ; delay slot: advance rptr
+L$end2                                 ; last multiply; sum previous limb
+       xmpyu       %fr5R,%fr4R,%fr6    ; partial products of the last limb
+       fstd        %fr6,-128(%r30)
+       xmpyu       %fr5R,%fr4L,%fr7
+       fstd        %fr7,-120(%r30)
+       xmpyu       %fr5L,%fr4R,%fr8
+       fstd        %fr8,-112(%r30)
+       xmpyu       %fr5L,%fr4L,%fr9
+       fstd        %fr9,-104(%r30)
+       ldd         0(rptr),rlimb       ; rlimb = limb to be added to
+       extrd,u     lo,31,32,t1         ; t1 = hi32(lo)
+       extrd,u     lo,63,32,t4         ; t4 = lo32(lo)
+       add,l       m0,t1,t1            ; t1 += m0
+       add,l,*nuv  m1,t1,t1            ; t1 += m1; skip next if no overflow
+       add,l       %r5,hi,hi           ; propagate carry: hi += 2^32
+       extrd,u     t1,31,32,t2         ; t2 = hi32(t1)
+       depd,z      t1,31,32,t5         ; t5 = lo32(t1) << 32
+       add,l       t5,t4,t4            ; t4 += t5: low 64 bits of product
+       ldd         -128(%r30),lo       ; lo = low 64 bit of product
+       add         cylimb,rlimb,rlimb  ; rlimb += carry-in limb; sets carry
+       ldd         -120(%r30),m0       ; m0 = mid0 64 bit of product
+       add,dc      t2,hi,cylimb        ; cylimb = t2 + hi + carry
+       ldd         -112(%r30),m1       ; m1 = mid1 64 bit of product
+       add         t4,rlimb,t3         ; t3 = result limb; sets carry
+       ldd         -104(%r30),hi       ; hi = high 64 bit of product
+       add,dc      %r0,cylimb,cylimb   ; cylimb += carry
+       std         t3,0(rptr)          ; store the result limb
+       ldo         8(rptr),rptr
+L$end1                                 ; sum the final limb; no more multiplies
+       ldd         0(rptr),rlimb       ; rlimb = limb to be added to
+       extrd,u     lo,31,32,t1         ; t1 = hi32(lo)
+       extrd,u     lo,63,32,t4         ; t4 = lo32(lo)
+       add,l       m0,t1,t1            ; t1 += m0
+       add,l,*nuv  m1,t1,t1            ; t1 += m1; skip next if no overflow
+       add,l       %r5,hi,hi           ; propagate carry: hi += 2^32
+       extrd,u     t1,31,32,t2         ; t2 = hi32(t1)
+       depd,z      t1,31,32,t5         ; t5 = lo32(t1) << 32
+       add,l       t5,t4,t4            ; t4 += t5: low 64 bits of product
+       add         cylimb,rlimb,rlimb  ; rlimb += carry-in limb; sets carry
+       add,dc      t2,hi,cylimb        ; cylimb = t2 + hi + carry
+       add         t4,rlimb,t3         ; t3 = result limb; sets carry
+       add,dc      %r0,cylimb,cylimb   ; cylimb += carry
+       std         t3,0(rptr)          ; store the result limb
+       ldo         8(rptr),rptr
+
+       ldd         -96(%r30),%r3       ; restore the registers saved on entry
+       ldd         -88(%r30),%r4
+       ldd         -80(%r30),%r5
+       ldd         -72(%r30),%r6
+
+       copy        cylimb,%r28         ; %r28 = final carry limb (return value)
+       bve         (%r2)               ; return through %r2
+       .exit
+       ldo         -128(%r30),%r30     ; delay slot: release the frame
+       .procend |