1 files changed, 0 insertions, 935 deletions
diff --git a/security/nss/lib/freebl/mpi/hppa20.s b/security/nss/lib/freebl/mpi/hppa20.s
deleted file mode 100644
index 4cabd249b..000000000
--- a/security/nss/lib/freebl/mpi/hppa20.s
+++ /dev/null
@@ -1,935 +0,0 @@
-; ***** BEGIN LICENSE BLOCK *****
-; Version: MPL 1.1/GPL 2.0/LGPL 2.1
-;
-; The contents of this file are subject to the Mozilla Public License Version
-; 1.1 (the "License"); you may not use this file except in compliance with
-; the License. You may obtain a copy of the License at
-; http://www.mozilla.org/MPL/
-;
-; Software distributed under the License is distributed on an "AS IS" basis,
-; WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-; for the specific language governing rights and limitations under the
-; License.
-;
-; The Original Code is MAXPY multiple-precision integer arithmetic.
-;
-; The Initial Developer of the Original Code is
-; the Hewlett-Packard Company.
-; Portions created by the Initial Developer are Copyright (C) 1997
-; the Initial Developer. All Rights Reserved.
-;
-; Contributor(s):
-;   coded by:   William B. Ackerman
-;
-; Alternatively, the contents of this file may be used under the terms of
-; either the GNU General Public License Version 2 or later (the "GPL"), or
-; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-; in which case the provisions of the GPL or the LGPL are applicable instead
-; of those above. If you wish to allow use of your version of this file only
-; under the terms of either the GPL or the LGPL, and not to allow others to
-; use your version of this file under the terms of the MPL, indicate your
-; decision by deleting the provisions above and replace them with the notice
-; and other provisions required by the GPL or the LGPL. If you do not delete
-; the provisions above, a recipient may use your version of this file under
-; the terms of any one of the MPL, the GPL or the LGPL.
-;
-; ***** END LICENSE BLOCK *****
-
-#ifdef __LP64__
-        .LEVEL   2.0W
-#else
-;       .LEVEL   1.1
-;       .ALLOW   2.0N
-        .LEVEL   2.0N
-#endif
-        .SPACE   $TEXT$,SORT=8
-        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
-
-; ***************************************************************
-;
-;                 maxpy_[little/big]
-;
-; ***************************************************************
-
-; There is no default -- you must specify one or the other.
-#define LITTLE_WORDIAN 1
-
-#ifdef LITTLE_WORDIAN
-#define EIGHT 8
-#define SIXTEEN 16
-#define THIRTY_TWO 32
-#define UN_EIGHT -8
-#define UN_SIXTEEN -16
-#define UN_TWENTY_FOUR -24
-#endif
-
-#ifdef BIG_WORDIAN
-#define EIGHT -8
-#define SIXTEEN -16
-#define THIRTY_TWO -32
-#define UN_EIGHT 8
-#define UN_SIXTEEN 16
-#define UN_TWENTY_FOUR 24
-#endif
-
-; This performs a multiple-precision integer version of "daxpy",
-; Using the selected addressing direction.  "Little-wordian" means that
-; the least significant word of a number is stored at the lowest address.
-; "Big-wordian" means that the most significant word is at the lowest
-; address.  Either way, the incoming address of the vector is that
-; of the least significant word.  That means that, for little-wordian
-; addressing, we move the address upward as we propagate carries
-; from the least significant word to the most significant.  For
-; big-wordian we move the address downward.
-
-; We use the following registers:
-;
-;     r2   return PC, of course
-;     r26 = arg1 =  length
-;     r25 = arg2 =  address of scalar
-;     r24 = arg3 =  multiplicand vector
-;     r23 = arg4 =  result vector
-;
-;     fr9 = scalar loaded once only from r25
-
-; The cycle counts shown in the bodies below are simply the result of a
-; scheduling by hand.  The actual PCX-U hardware does it differently.
-; The intention is that the overall speed is the same.
-
-; The pipeline startup and shutdown code is constructed in the usual way,
-; by taking the loop bodies and removing unnecessary instructions.
-; We have left the comments describing cycle numbers in the code.
-; These are intended for reference when comparing with the main loop,
-; and have no particular relationship to actual cycle numbers.
-
-#ifdef LITTLE_WORDIAN
-maxpy_little
-#else
-maxpy_big
-#endif
-        .PROC
-        .CALLINFO FRAME=120,ENTRY_GR=%r4
-        .ENTER
-
-; Of course, real men don't use the sissy "enter" and "leave" commands.
-; They write their own stack manipulation stuff.  Unfortunately,
-; that doesn't generate complete unwind info, whereas "enter" and
-; "leave" (if the documentation is to be believed) do so.  Therefore,
-; we use the sissy commands.  We have verified (by real-man methods)
-; that the above command generates what we want:
-;       STW,MA  %r3,128(%sp)
-;       STW     %r4,-124(%sp)
-
-        ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
-        FLDD    0(%r25),%fr9        ; fr9 = scalar
-
-; First startup
-
-        FLDD    0(%r24),%fr24       ; Cycle 1
-        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
-        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
-        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
-        CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
-        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
-        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
-        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
-        FSTD    %fr24,-96(%sp)
-        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
-        FSTD    %fr25,-80(%sp)
-        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
-        FSTD    %fr31,-64(%sp)
-        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
-        FSTD    %fr27,-48(%sp)
-
-; Second startup
-
-        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
-        FSTD    %fr30,-56(%sp)
-        FLDD    0(%r24),%fr24
-
-        FSTD    %fr26,-88(%sp)      ; Cycle 2
-
-        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
-        FSTD    %fr28,-104(%sp)
-
-        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
-        LDD     -96(%sp),%r3
-        FSTD    %fr29,-72(%sp)
-
-        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
-        LDD     -64(%sp),%r19
-        LDD     -80(%sp),%r21
-
-        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
-        LDD     -56(%sp),%r20
-        ADD     %r21,%r3,%r3
-
-        ADD,DC  %r20,%r19,%r19      ; Cycle 7
-        LDD     -88(%sp),%r4
-        SHRPD   %r3,%r0,32,%r21
-        LDD     -48(%sp),%r1
-
-        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
-        LDD     -104(%sp),%r31
-        ADD,DC  %r0,%r0,%r20
-        SHRPD   %r19,%r3,32,%r3
-
-        LDD     -72(%sp),%r29       ; Cycle 9
-        SHRPD   %r20,%r19,32,%r20
-        ADD     %r21,%r1,%r1
-
-        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
-        ADD,DC  %r3,%r4,%r4
-        FSTD    %fr24,-96(%sp)
-
-        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
-        ADD,DC  %r0,%r20,%r20
-        LDD     0(%r23),%r3
-        FSTD    %fr25,-80(%sp)
-
-        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
-        FSTD    %fr31,-64(%sp)
-
-        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
-        ADD     %r0,%r0,%r0         ; clear the carry bit
-        ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
-        FSTD    %fr27,-48(%sp)
-;        MFCTL   %cr16,%r21         ; for timing
-;        STD     %r21,-112(%sp)
-
-; Here is the loop.
-
-$LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
-        ADD,DC  %r29,%r4,%r4
-        FSTD    %fr30,-56(%sp)
-        FLDD    0(%r24),%fr24
-
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        ADD,DC  %r0,%r20,%r20
-        FSTD    %fr26,-88(%sp)
-
-        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
-        ADD     %r3,%r1,%r1
-        FSTD    %fr28,-104(%sp)
-        LDD     UN_EIGHT(%r23),%r21
-
-        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
-        ADD,DC  %r21,%r4,%r28
-        FSTD    %fr29,-72(%sp)    
-        LDD     -96(%sp),%r3
-
-        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
-        ADD,DC  %r20,%r31,%r22
-        LDD     -64(%sp),%r19
-        LDD     -80(%sp),%r21
-
-        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
-        ADD     %r21,%r3,%r3
-        LDD     -56(%sp),%r20
-        STD     %r1,UN_SIXTEEN(%r23)
-
-        ADD,DC  %r20,%r19,%r19      ; Cycle 7
-        SHRPD   %r3,%r0,32,%r21
-        LDD     -88(%sp),%r4
-        LDD     -48(%sp),%r1
-
-        ADD,DC  %r0,%r0,%r20        ; Cycle 8
-        SHRPD   %r19,%r3,32,%r3
-        FLDD    EIGHT(%r24),%fr28
-        LDD     -104(%sp),%r31
-
-        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
-        ADD     %r21,%r1,%r1
-        STD     %r28,UN_EIGHT(%r23)
-        LDD     -72(%sp),%r29
-
-        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
-        ADD,DC  %r3,%r4,%r4
-        FSTD    %fr24,-96(%sp)
-
-        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
-        ADD,DC  %r0,%r20,%r20
-        FSTD    %fr25,-80(%sp)
-        LDD     0(%r23),%r3
-
-        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
-        FSTD    %fr31,-64(%sp)
-
-        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
-        ADD     %r22,%r1,%r1
-        ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
-        FSTD    %fr27,-48(%sp)
-
-$ENDLOOP
-
-; Shutdown code, first stage.
-
-;        MFCTL   %cr16,%r21         ; for timing
-;        STD     %r21,UN_SIXTEEN(%r23)
-;        LDD     -112(%sp),%r21
-;        STD     %r21,UN_EIGHT(%r23)
-
-        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
-        ADD,DC  %r29,%r4,%r4
-        CMPIB,= 0,%r26,$ONEMORE
-        FSTD    %fr30,-56(%sp)
-
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        ADD,DC  %r0,%r20,%r20
-        FSTD    %fr26,-88(%sp)
-
-        ADD     %r3,%r1,%r1         ; Cycle 3
-        FSTD    %fr28,-104(%sp)
-        LDD     UN_EIGHT(%r23),%r21
-
-        ADD,DC  %r21,%r4,%r28       ; Cycle 4
-        FSTD    %fr29,-72(%sp)    
-        STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
-        LDD     -96(%sp),%r3
-
-        ADD,DC  %r20,%r31,%r22      ; Cycle 5
-        STD     %r1,UN_SIXTEEN(%r23)
-$JOIN4
-        LDD     -64(%sp),%r19
-        LDD     -80(%sp),%r21
-
-        ADD     %r21,%r3,%r3        ; Cycle 6
-        LDD     -56(%sp),%r20
-
-        ADD,DC  %r20,%r19,%r19      ; Cycle 7
-        SHRPD   %r3,%r0,32,%r21
-        LDD     -88(%sp),%r4
-        LDD     -48(%sp),%r1
-
-        ADD,DC  %r0,%r0,%r20        ; Cycle 8
-        SHRPD   %r19,%r3,32,%r3
-        LDD     -104(%sp),%r31
-
-        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
-        ADD     %r21,%r1,%r1
-        LDD     -72(%sp),%r29
-
-        ADD,DC  %r3,%r4,%r4         ; Cycle 10
-
-        ADD,DC  %r0,%r20,%r20       ; Cycle 11
-        LDD     0(%r23),%r3
-
-        ADD     %r22,%r1,%r1        ; Cycle 13
-
-; Shutdown code, second stage.
-
-        ADD,DC  %r29,%r4,%r4        ; Cycle 1
-
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        ADD,DC  %r0,%r20,%r20
-
-        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
-        ADD     %r3,%r1,%r1
-
-        ADD,DC  %r21,%r4,%r28       ; Cycle 4
-
-        ADD,DC  %r20,%r31,%r22      ; Cycle 5
-
-        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
-
-        STD     %r28,UN_EIGHT(%r23) ; Cycle 9
-
-        LDD     0(%r23),%r3         ; Cycle 11
-
-; Shutdown code, third stage.
-
-        LDO     SIXTEEN(%r23),%r23
-        ADD     %r3,%r22,%r1
-$JOIN1  ADD,DC  %r0,%r0,%r21
-        CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
-        STD     %r1,UN_SIXTEEN(%r23)
-
-; Final carry propagation
-
-$FINAL1 LDO     EIGHT(%r23),%r23
-        LDD     UN_SIXTEEN(%r23),%r21
-        ADDI    1,%r21,%r21
-        CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
-        STD     %r21,UN_SIXTEEN(%r23)
-        B       $L0
-        NOP
-
-; Here is the code that handles the difficult cases N=1, N=2, and N=3.
-; We do the usual trick -- branch out of the startup code at appropriate
-; points, and branch into the shutdown code.
-
-$N_IS_SMALL
-        CMPIB,= 0,%r26,$N_IS_ONE
-        FSTD    %fr24,-96(%sp)      ; Cycle 10
-        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
-        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
-        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
-        FSTD    %fr25,-80(%sp)
-        FSTD    %fr31,-64(%sp)      ; Cycle 12
-        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
-        FSTD    %fr27,-48(%sp)
-        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
-        CMPIB,= 2,%r26,$N_IS_THREE
-        FSTD    %fr30,-56(%sp)
-
-; N = 2
-        FSTD    %fr26,-88(%sp)      ; Cycle 2
-        FSTD    %fr28,-104(%sp)     ; Cycle 3
-        LDD     -96(%sp),%r3        ; Cycle 4
-        FSTD    %fr29,-72(%sp)
-        B       $JOIN4
-        ADD     %r0,%r0,%r22
-
-$N_IS_THREE
-        FLDD    SIXTEEN(%r24),%fr24
-        FSTD    %fr26,-88(%sp)      ; Cycle 2
-        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
-        FSTD    %fr28,-104(%sp)
-        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
-        LDD     -96(%sp),%r3
-        FSTD    %fr29,-72(%sp)
-        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
-        LDD     -64(%sp),%r19
-        LDD     -80(%sp),%r21
-        B       $JOIN3
-        ADD     %r0,%r0,%r22
-
-$N_IS_ONE
-        FSTD    %fr25,-80(%sp)
-        FSTD    %fr27,-48(%sp)
-        FSTD    %fr26,-88(%sp)      ; Cycle 2
-        B       $JOIN5
-        ADD     %r0,%r0,%r22
-
-; We came out of the unrolled loop with wrong parity.  Do one more
-; single cycle.  This is quite tricky, because of the way the
-; carry chains and SHRPD chains have been chopped up.
-
-$ONEMORE
-
-        FLDD    0(%r24),%fr24
-
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        ADD,DC  %r0,%r20,%r20
-        FSTD    %fr26,-88(%sp)
-
-        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
-        FSTD    %fr28,-104(%sp)
-        LDD     UN_EIGHT(%r23),%r21
-        ADD     %r3,%r1,%r1
-
-        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
-        ADD,DC  %r21,%r4,%r28
-        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
-        LDD     -96(%sp),%r3
-        FSTD    %fr29,-72(%sp)    
-
-        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
-        ADD,DC  %r20,%r31,%r22
-        LDD     -64(%sp),%r19
-        LDD     -80(%sp),%r21
-
-        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
-$JOIN3
-        XMPYU   %fr9L,%fr24R,%fr24
-        LDD     -56(%sp),%r20
-        ADD     %r21,%r3,%r3
-
-        ADD,DC  %r20,%r19,%r19      ; Cycle 7
-        LDD     -88(%sp),%r4
-        SHRPD   %r3,%r0,32,%r21
-        LDD     -48(%sp),%r1
-
-        LDD     -104(%sp),%r31      ; Cycle 8
-        ADD,DC  %r0,%r0,%r20
-        SHRPD   %r19,%r3,32,%r3
-
-        LDD     -72(%sp),%r29       ; Cycle 9
-        SHRPD   %r20,%r19,32,%r20
-        ADD     %r21,%r1,%r1
-
-        ADD,DC  %r3,%r4,%r4         ; Cycle 10
-        FSTD    %fr24,-96(%sp)
-
-        ADD,DC  %r0,%r20,%r20       ; Cycle 11
-        LDD     0(%r23),%r3
-        FSTD    %fr25,-80(%sp)
-
-        ADD     %r22,%r1,%r1        ; Cycle 13
-        FSTD    %fr27,-48(%sp)
-
-; Shutdown code, stage 1-1/2.
-
-        ADD,DC  %r29,%r4,%r4        ; Cycle 1
-
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        ADD,DC  %r0,%r20,%r20     
-        FSTD    %fr26,-88(%sp)
-
-        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
-        ADD     %r3,%r1,%r1
-
-        ADD,DC  %r21,%r4,%r28       ; Cycle 4
-        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
-
-        ADD,DC  %r20,%r31,%r22      ; Cycle 5
-        STD     %r1,UN_SIXTEEN(%r23)
-$JOIN5
-        LDD     -96(%sp),%r3        ; moved from cycle 4
-        LDD     -80(%sp),%r21
-        ADD     %r21,%r3,%r3        ; Cycle 6
-        ADD,DC  %r0,%r0,%r19        ; Cycle 7
-        LDD     -88(%sp),%r4
-        SHRPD   %r3,%r0,32,%r21
-        LDD     -48(%sp),%r1
-        SHRPD   %r19,%r3,32,%r3     ; Cycle 8
-        ADD     %r21,%r1,%r1        ; Cycle 9
-        ADD,DC  %r3,%r4,%r4         ; Cycle 10
-        LDD     0(%r23),%r3         ; Cycle 11
-        ADD     %r22,%r1,%r1        ; Cycle 13
-
-; Shutdown code, stage 2-1/2.
-
-        ADD,DC  %r0,%r4,%r4         ; Cycle 1
-        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
-        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
-        ADD     %r3,%r1,%r1
-        STD     %r1,UN_SIXTEEN(%r23)
-        ADD,DC  %r21,%r4,%r1
-        B       $JOIN1
-        LDO     EIGHT(%r23),%r23
-
-; exit
-
-$L0
-        .LEAVE
-
-; We have verified that the above command generates what we want:
-;       LDW     -124(%sp),%r4
-;       BVE     (%r2)
-;       LDW,MB  -128(%sp),%r3
-
-        .PROCEND
-
-; ***************************************************************
-;
-;                 add_diag_[little/big]
-;
-; ***************************************************************
-
-; The arguments are as follows:
-;     r2   return PC, of course
-;     r26 = arg1 =  length
-;     r25 = arg2 =  vector to square
-;     r24 = arg3 =  result vector
-
-#ifdef LITTLE_WORDIAN
-add_diag_little
-#else
-add_diag_big
-#endif
-        .PROC
-        .CALLINFO FRAME=120,ENTRY_GR=%r4
-        .ENTER
-
-        ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
-        NOP
-
-; Startup code
-
-        FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
-        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
-        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
-        XMPYU   %fr7L,%fr7L,%fr30
-        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
-        FSTD    %fr29,-88(%sp)
-        FSTD    %fr27,-72(%sp)      ; Cycle 7
-        CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
-        FSTD    %fr30,-96(%sp)
-        FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
-        LDD     -88(%sp),%r22       ; Cycle 3
-        LDD     -72(%sp),%r31       ; Cycle 4
-        XMPYU   %fr7R,%fr7R,%fr28
-        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
-        XMPYU   %fr7L,%fr7L,%fr31
-        LDD     -96(%sp),%r20       ; Cycle 6
-        FSTD    %fr28,-80(%sp)
-        ADD     %r0,%r0,%r0         ; clear the carry bit
-        ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
-        FSTD    %fr24,-64(%sp)
-
-; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
-
-$DIAGLOOP
-        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
-        LDO     SIXTEEN(%r25),%r25
-        LDD     0(%r24),%r1
-        FSTD    %fr31,-104(%sp)
-        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
-        ADD,DC  %r22,%r3,%r3
-        FLDD    UN_SIXTEEN(%r25),%fr7   
-        ADD,DC  %r0,%r20,%r20       ; Cycle 3
-        ADD     %r1,%r3,%r3
-        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
-        LDD     -80(%sp),%r21
-        STD     %r3,0(%r24)
-        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
-        XMPYU   %fr7L,%fr7L,%fr30
-        LDD     -64(%sp),%r29       
-        LDD     EIGHT(%r24),%r1  
-        ADD,DC  %r4,%r20,%r20       ; Cycle 6
-        LDD     -104(%sp),%r19
-        FSTD    %fr29,-88(%sp)
-        ADD     %r20,%r1,%r1        ; Cycle 7
-        FSTD    %fr27,-72(%sp)
-        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
-        LDO     THIRTY_TWO(%r24),%r24
-        LDD     UN_SIXTEEN(%r24),%r28
-        FSTD    %fr30,-96(%sp)
-        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
-        ADD,DC  %r21,%r4,%r4
-        FLDD    UN_EIGHT(%r25),%fr7
-        STD     %r1,UN_TWENTY_FOUR(%r24)
-        ADD,DC  %r0,%r19,%r19       ; Cycle 3
-        ADD     %r28,%r4,%r4
-        XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
-        LDD     -88(%sp),%r22
-        STD     %r4,UN_SIXTEEN(%r24)
-        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
-        XMPYU   %fr7L,%fr7L,%fr31
-        LDD     -72(%sp),%r31
-        LDD     UN_EIGHT(%r24),%r28
-        ADD,DC  %r3,%r19,%r19       ; Cycle 6
-        LDD     -96(%sp),%r20
-        FSTD    %fr28,-80(%sp)
-        ADD     %r19,%r28,%r28      ; Cycle 7
-        FSTD    %fr24,-64(%sp)
-        ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
-        STD     %r28,UN_EIGHT(%r24)
-
-$ENDDIAGLOOP
-
-        ADD,DC  %r0,%r22,%r22    
-        CMPIB,= 0,%r26,$ONEMOREDIAG
-        SHRPD   %r31,%r0,31,%r3
-
-; Shutdown code, first stage.
-
-        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
-        LDD     0(%r24),%r28
-        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
-        ADD     %r3,%r22,%r3
-        ADD,DC  %r0,%r20,%r20       ; Cycle 3
-        LDD     -80(%sp),%r21
-        ADD     %r3,%r28,%r3
-        LDD     -64(%sp),%r29       ; Cycle 4
-        STD     %r3,0(%r24)
-        LDD     EIGHT(%r24),%r1     ; Cycle 5
-        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
-        LDD     -104(%sp),%r19
-        ADD,DC  %r4,%r20,%r20
-        ADD     %r20,%r1,%r1        ; Cycle 7
-        ADD,DC  %r0,%r21,%r21       ; Cycle 8
-        STD     %r1,EIGHT(%r24)
-
-; Shutdown code, second stage.
-
-        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
-        LDO     THIRTY_TWO(%r24),%r24
-        LDD     UN_SIXTEEN(%r24),%r1
-        SHRPD   %r0,%r29,31,%r3      ; Cycle 2
-        ADD     %r4,%r21,%r4
-        ADD,DC  %r0,%r19,%r19       ; Cycle 3
-        ADD     %r4,%r1,%r4
-        STD     %r4,UN_SIXTEEN(%r24); Cycle 4
-        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
-        ADD,DC  %r3,%r19,%r19       ; Cycle 6       
-        ADD     %r19,%r28,%r28      ; Cycle 7
-        ADD,DC  %r0,%r0,%r22        ; Cycle 8
-        CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
-        STD     %r28,UN_EIGHT(%r24)
-
-; Final carry propagation
-
-$FDIAG2
-        LDO     EIGHT(%r24),%r24
-        LDD     UN_EIGHT(%r24),%r26
-        ADDI    1,%r26,%r26
-        CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
-        STD     %r26,UN_EIGHT(%r24)
-
-        B   $Z0
-        NOP
-
-; Here is the code that handles the difficult case N=1.
-; We do the usual trick -- branch out of the startup code at appropriate
-; points, and branch into the shutdown code.
-
-$DIAG_N_IS_ONE
-
-        LDD     -88(%sp),%r22
-        LDD     -72(%sp),%r31
-        B       $JOINDIAG
-        LDD     -96(%sp),%r20
-
-; We came out of the unrolled loop with wrong parity.  Do one more
-; single cycle.  This is the "alternate body".  It will, of course,
-; give us opposite registers from the other case, so we need
-; completely different shutdown code.
-
-$ONEMOREDIAG
-        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
-        LDD     0(%r24),%r28
-        FLDD    0(%r25),%fr7        ; Cycle 2
-        SHRPD   %r0,%r31,31,%r4
-        ADD     %r3,%r22,%r3
-        ADD,DC  %r0,%r20,%r20       ; Cycle 3
-        LDD     -80(%sp),%r21
-        ADD     %r3,%r28,%r3
-        LDD     -64(%sp),%r29       ; Cycle 4
-        STD     %r3,0(%r24)
-        XMPYU   %fr7R,%fr7R,%fr29
-        LDD     EIGHT(%r24),%r1     ; Cycle 5
-        XMPYU   %fr7L,%fr7R,%fr27
-        XMPYU   %fr7L,%fr7L,%fr30
-        LDD     -104(%sp),%r19      ; Cycle 6
-        FSTD    %fr29,-88(%sp)
-        ADD,DC  %r4,%r20,%r20
-        FSTD    %fr27,-72(%sp)      ; Cycle 7
-        ADD     %r20,%r1,%r1
-        ADD,DC  %r0,%r21,%r21       ; Cycle 8
-        STD     %r1,EIGHT(%r24)
-
-; Shutdown code, first stage.
-
-        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
-        LDO     THIRTY_TWO(%r24),%r24
-        FSTD    %fr30,-96(%sp)
-        LDD     UN_SIXTEEN(%r24),%r1
-        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
-        ADD     %r4,%r21,%r4
-        ADD,DC  %r0,%r19,%r19       ; Cycle 3
-        LDD     -88(%sp),%r22
-        ADD     %r4,%r1,%r4
-        LDD     -72(%sp),%r31       ; Cycle 4
-        STD     %r4,UN_SIXTEEN(%r24)
-        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
-        LDD     -96(%sp),%r20       ; Cycle 6
-        ADD,DC  %r3,%r19,%r19
-        ADD     %r19,%r28,%r28      ; Cycle 7
-        ADD,DC  %r0,%r22,%r22       ; Cycle 8
-        STD     %r28,UN_EIGHT(%r24)
-
-; Shutdown code, second stage.
-
-$JOINDIAG
-        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
-        LDD     0(%r24),%r28        
-        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
-        ADD     %r3,%r22,%r3
-        ADD,DC  %r0,%r20,%r20       ; Cycle 3
-        ADD     %r3,%r28,%r3
-        STD     %r3,0(%r24)         ; Cycle 4
-        LDD     EIGHT(%r24),%r1     ; Cycle 5
-        ADD,DC  %r4,%r20,%r20
-        ADD     %r20,%r1,%r1        ; Cycle 7
-        ADD,DC  %r0,%r0,%r21        ; Cycle 8
-        CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
-        STD     %r1,EIGHT(%r24)
-
-; Final carry propagation
-
-$FDIAG1
-        LDO     EIGHT(%r24),%r24
-        LDD     EIGHT(%r24),%r26
-        ADDI    1,%r26,%r26
-        CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
-        STD     %r26,EIGHT(%r24)
-
-$Z0
-        .LEAVE
-        .PROCEND
-;	.ALLOW
-
-        .SPACE         $TEXT$
-        .SUBSPA        $CODE$
-#ifdef LITTLE_WORDIAN
-        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
-        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
-#else
-        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
-        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
-#endif
-        .END
-
-
-; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
-; 
-; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
-; performs a 64-bit x any-size multiply, and adds the
-; result to an area of memory.  That is, it performs
-; something like
-; 
-;      A B C D
-;    *       Z
-;   __________
-;    P Q R S T
-; 
-; and then adds the "PQRST" vector into an area of memory,
-; handling all carries.
-; 
-; Digression on nomenclature and endian-ness:
-; 
-; Each of the capital letters in the above represents a 64-bit
-; quantity.  That is, you could think of the discussion as
-; being in terms of radix-16-quintillion arithmetic.  The data
-; type being manipulated is "unsigned long long int".  This
-; requires the 64-bit extension of the HP-UX C compiler,
-; available at release 10.  You need these compiler flags to
-; enable these extensions:
-; 
-;       -Aa +e +DA2.0 +DS2.0
-; 
-; (The first specifies ANSI C, the second enables the
-; extensions, which are beyond ANSI C, and the third and
-; fourth tell the compiler to use whatever features of the
-; PA2.0 architecture it wishes, in order to made the code more
-; efficient.  Since the presence of the assembly code will
-; make the program unable to run on anything less than PA2.0,
-; you might as well gain the performance enhancements in the C
-; code as well.)
-; 
-; Questions of "endian-ness" often come up, usually in the
-; context of byte ordering in a word.  These routines have a
-; similar issue, that could be called "wordian-ness".
-; Independent of byte ordering (PA is always big-endian), one
-; can make two choices when representing extremely large
-; numbers as arrays of 64-bit doublewords in memory.
-; 
-; "Little-wordian" layout means that the least significant
-; word of a number is stored at the lowest address.
-; 
-;   MSW     LSW
-;    |       |
-;    V       V
-; 
-;    A B C D E
-; 
-;    ^     ^ ^
-;    |     | |____ address 0
-;    |     |
-;    |     |_______address 8
-;    |
-;    address 32
-; 
-; "Big-wordian" means that the most significant word is at the
-; lowest address.
-; 
-;   MSW     LSW
-;    |       |
-;    V       V
-; 
-;    A B C D E
-; 
-;    ^     ^ ^
-;    |     | |____ address 32
-;    |     |
-;    |     |_______address 24
-;    |
-;    address 0
-; 
-; When you compile the file, you must specify one or the other, with
-; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
-; 
-;     Incidentally, you assemble this file as part of your
-;     project with the same C compiler as the rest of the program.
-;     My "makefile" for a superprecision arithmetic package has
-;     the following stuff:
-; 
-;     # definitions:
-;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
-;     CFLAGS = +O3
-;     LDFLAGS = -L /usr/lib -Wl,-aarchive
-; 
-;     # general build rule for ".s" files:
-;     .s.o:
-;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
-; 
-;     # Now any bind step that calls for pa20.o will assemble pa20.s
-; 
-; End of digression, back to arithmetic:
-; 
-; The way we multiply two huge numbers is, of course, to multiply
-; the "ABCD" vector by each of the "WXYZ" doublewords, adding
-; the result vectors with increasing offsets, the way we learned
-; in school, back before we all used calculators:
-; 
-;            A B C D
-;          * W X Y Z
-;         __________
-;          P Q R S T
-;        E F G H I
-;      M N O P Q
-;  + R S T U V
-;    _______________
-;    F I N A L S U M
-; 
-; So we call maxpy_PA20_big (in my case; my package is
-; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
-; in turn as the "scalar", and giving the "ABCD" vector each
-; time.  We direct it to add its result into an area of memory
-; that we have cleared at the start.  We skew the exact
-; location into that area with each call.
-; 
-; The prototype for the function is
-; 
-; extern void maxpy_PA20_big(
-;    int length,        /* Number of doublewords in the multiplicand vector. */
-;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
-;    const long long int *multiplicand,  /* The multiplicand vector. */
-;    long long int *result);             /* Where to accumulate the result. */
-; 
-; (You should place a copy of this prototype in an include file
-; or in your C file.)
-; 
-; Now, IN ALL CASES, the given address for the multiplicand or
-; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
-; That word is, of course, the word at which the routine
-; starts processing.  "maxpy_PA20_little" then increases the
-; addresses as it computes.  "maxpy_PA20_big" decreases them.
-; 
-; In our example above, "length" would be 4 in each case.
-; "multiplicand" would be the "ABCD" vector.  Specifically,
-; the address of the element "D".  "scalaraddr" would be the
-; address of "W", "X", "Y", or "Z" on the four calls that we
-; would make.  (The order doesn't matter, of course.)
-; "result" would be the appropriate address in the result
-; area.  When multiplying by "Z", that would be the least
-; significant word.  When multiplying by "Y", it would be the
-; next higher word (8 bytes higher if little-wordian; 8 bytes
-; lower if big-wordian), and so on.  The size of the result
-; area must be the the sum of the sizes of the multiplicand
-; and multiplier vectors, and must be initialized to zero
-; before we start.
-; 
-; Whenever the routine adds its partial product into the result
-; vector, it follows carry chains as far as they need to go.
-; 
-; Here is the super-precision multiply routine that I use for
-; my package.  The package is big-wordian.  I have taken out
-; handling of exponents (it's a floating point package):
-; 
-; static void mul_PA20(
-;   int size,
-;   const long long int *arg1,
-;   const long long int *arg2,
-;   long long int *result)
-; {
-;    int i;
-; 
-;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
-; 
-;    for (i=0 ; i<size ; i++) {
-;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
-;    }
-; }