summaryrefslogtreecommitdiff
path: root/rts/gmp/mpn/x86
diff options
context:
space:
mode:
Diffstat (limited to 'rts/gmp/mpn/x86')
-rw-r--r--rts/gmp/mpn/x86/README40
-rw-r--r--rts/gmp/mpn/x86/README.family333
-rw-r--r--rts/gmp/mpn/x86/addsub_n.S174
-rw-r--r--rts/gmp/mpn/x86/aors_n.asm187
-rw-r--r--rts/gmp/mpn/x86/aorsmul_1.asm134
-rw-r--r--rts/gmp/mpn/x86/copyd.asm80
-rw-r--r--rts/gmp/mpn/x86/copyi.asm79
-rw-r--r--rts/gmp/mpn/x86/diveby3.asm115
-rw-r--r--rts/gmp/mpn/x86/divrem_1.asm232
-rw-r--r--rts/gmp/mpn/x86/k6/README237
-rw-r--r--rts/gmp/mpn/x86/k6/aors_n.asm329
-rw-r--r--rts/gmp/mpn/x86/k6/aorsmul_1.asm372
-rw-r--r--rts/gmp/mpn/x86/k6/cross.pl141
-rw-r--r--rts/gmp/mpn/x86/k6/diveby3.asm110
-rw-r--r--rts/gmp/mpn/x86/k6/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyd.asm179
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyi.asm196
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/lshift.asm286
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/rshift.asm285
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/com_n.asm91
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/logops_n.asm212
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/lshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/popham.asm238
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/rshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mul_1.asm272
-rw-r--r--rts/gmp/mpn/x86/k6/mul_basecase.asm600
-rw-r--r--rts/gmp/mpn/x86/k6/sqr_basecase.asm672
-rw-r--r--rts/gmp/mpn/x86/k7/README145
-rw-r--r--rts/gmp/mpn/x86/k7/aors_n.asm250
-rw-r--r--rts/gmp/mpn/x86/k7/aorsmul_1.asm364
-rw-r--r--rts/gmp/mpn/x86/k7/diveby3.asm131
-rw-r--r--rts/gmp/mpn/x86/k7/gmp-mparam.h100
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyd.asm136
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyi.asm147
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/divrem_1.asm718
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/lshift.asm472
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/mod_1.asm457
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/popham.asm239
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/rshift.asm471
-rw-r--r--rts/gmp/mpn/x86/k7/mul_1.asm265
-rw-r--r--rts/gmp/mpn/x86/k7/mul_basecase.asm593
-rw-r--r--rts/gmp/mpn/x86/k7/sqr_basecase.asm627
-rw-r--r--rts/gmp/mpn/x86/lshift.asm90
-rw-r--r--rts/gmp/mpn/x86/mod_1.asm141
-rw-r--r--rts/gmp/mpn/x86/mul_1.asm130
-rw-r--r--rts/gmp/mpn/x86/mul_basecase.asm209
-rw-r--r--rts/gmp/mpn/x86/p6/README95
-rw-r--r--rts/gmp/mpn/x86/p6/aorsmul_1.asm300
-rw-r--r--rts/gmp/mpn/x86/p6/diveby3.asm37
-rw-r--r--rts/gmp/mpn/x86/p6/gmp-mparam.h96
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/divrem_1.asm677
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/mod_1.asm444
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/popham.asm31
-rw-r--r--rts/gmp/mpn/x86/p6/p3mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/p6/sqr_basecase.asm641
-rw-r--r--rts/gmp/mpn/x86/pentium/README77
-rw-r--r--rts/gmp/mpn/x86/pentium/aors_n.asm196
-rw-r--r--rts/gmp/mpn/x86/pentium/aorsmul_1.asm99
-rw-r--r--rts/gmp/mpn/x86/pentium/diveby3.asm183
-rw-r--r--rts/gmp/mpn/x86/pentium/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/lshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/lshift.asm455
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/rshift.asm460
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_1.asm79
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_basecase.asm135
-rw-r--r--rts/gmp/mpn/x86/pentium/rshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/sqr_basecase.asm520
-rw-r--r--rts/gmp/mpn/x86/rshift.asm92
-rw-r--r--rts/gmp/mpn/x86/udiv.asm44
-rw-r--r--rts/gmp/mpn/x86/umul.asm43
-rw-r--r--rts/gmp/mpn/x86/x86-defs.m4713
73 files changed, 0 insertions, 17763 deletions
diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README
deleted file mode 100644
index 3507548b8c..0000000000
--- a/rts/gmp/mpn/x86/README
+++ /dev/null
@@ -1,40 +0,0 @@
-
- X86 MPN SUBROUTINES
-
-
-This directory contains mpn functions for various 80x86 chips.
-
-
-CODE ORGANIZATION
-
- x86 i386, i486, generic
- x86/pentium Intel Pentium (P5, P54)
- x86/pentium/mmx Intel Pentium with MMX (P55)
- x86/p6 Intel Pentium Pro
- x86/p6/mmx Intel Pentium II, III
- x86/p6/p3mmx Intel Pentium III
- x86/k6 AMD K6, K6-2, K6-3
- x86/k6/mmx
- x86/k6/k62mmx AMD K6-2
- x86/k7 AMD Athlon
- x86/k7/mmx
-
-
-The x86 directory is also the main support for P6 at the moment, and
-is something of a blended style, meant to be reasonable on all x86s.
-
-
-
-STATUS
-
-The code is well-optimized for AMD and Intel chips, but not so well
-optimized for Cyrix chips.
-
-
-
-RELEVANT OPTIMIZATION ISSUES
-
-For implementations with slow double shift instructions (SHLD and
-SHRD), it might be better to mimic their operation with SHL+SHR+OR.
-(M2 is likely to benefit from that, but not Pentium due to its slow
-plain SHL and SHR.)
diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family
deleted file mode 100644
index 3bc73f58b0..0000000000
--- a/rts/gmp/mpn/x86/README.family
+++ /dev/null
@@ -1,333 +0,0 @@
-
- X86 CPU FAMILY MPN SUBROUTINES
-
-
-This file has some notes on things common to all the x86 family code.
-
-
-
-ASM FILES
-
-The x86 .asm files are BSD style x86 assembler code, first put through m4
-for macro processing. The generic mpn/asm-defs.m4 is used, together with
-mpn/x86/x86-defs.m4. Detailed notes are in those files.
-
-The code is meant for use with GNU "gas" or a system "as". There's no
-support for assemblers that demand Intel style, and with gas freely
-available and easy to use that shouldn't be a problem.
-
-
-
-STACK FRAME
-
-m4 macros are used to define the parameters passed on the stack, and these
-act like comments on what the stack frame looks like too. For example,
-mpn_mul_1() has the following.
-
- defframe(PARAM_MULTIPLIER, 16)
- defframe(PARAM_SIZE, 12)
- defframe(PARAM_SRC, 8)
- defframe(PARAM_DST, 4)
-
-Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others
-similarly. The return address is at offset 0, but there's not normally any
-need to access that.
-
-FRAME is redefined as necessary through the code so it's the number of bytes
-pushed on the stack, and hence the offsets in the parameter macros stay
-correct. At the start of a routine FRAME should be zero.
-
- deflit(`FRAME',0)
- ...
- deflit(`FRAME',4)
- ...
- deflit(`FRAME',8)
- ...
-
-Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
-FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
-and can be used instead of explicit definitions if preferred.
-defframe_pushl() is a combination FRAME_pushl() and defframe().
-
-There's generally some slackness in redefining FRAME. If new values aren't
-going to get used, then the redefinitions are omitted to keep from
-cluttering up the code. This happens for instance at the end of a routine,
-where there might be just four register pops and then a ret, so FRAME isn't
-getting used.
-
-Local variables and saved registers can be similarly defined, with negative
-offsets representing stack space below the initial stack pointer. For
-example,
-
- defframe(SAVE_ESI, -4)
- defframe(SAVE_EDI, -8)
- defframe(VAR_COUNTER,-12)
-
- deflit(STACK_SPACE, 12)
-
-Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
-space, and that instruction must be followed by a redefinition of FRAME
-(setting it equal to STACK_SPACE) to reflect the change in %esp.
-
-Definitions for pushed registers are only put in when they're going to be
-used. If registers are just saved and restored with pushes and pops then
-definitions aren't made.
-
-
-
-ASSEMBLER EXPRESSIONS
-
-Only addition and subtraction seem to be universally available, certainly
-that's all the Solaris 8 "as" seems to accept. If expressions are wanted
-then m4 eval() should be used.
-
-In particular note that a "/" anywhere in a line starts a comment in Solaris
-"as", and in some configurations of gas too.
-
- addl $32/2, %eax <-- wrong
-
- addl $eval(32/2), %eax <-- right
-
-Binutils gas/config/tc-i386.c has a choice between "/" being a comment
-anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select
-the latter, and as of 2.9.5 it's the default for GNU/Linux too.
-
-
-
-ASSEMBLER COMMENTS
-
-Solaris "as" doesn't support "#" commenting, using /* */ instead,
-unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and
-the intermediate ".s" files have no comments.
-
-
-
-ZERO DISPLACEMENTS
-
-In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
-displacement are wanted, rather than (%ebx) with no displacement. These are
-either for computed jumps or to get desirable code alignment. Explicit
-.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
-(%ebx). The Zdisp() macro in x86-defs.m4 is used for this.
-
-Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
-1.92.3 changes it. In general changing would be the sort of "optimization"
-an assembler might perform, hence explicit ".byte"s are used where
-necessary.
-
-
-
-SHLD/SHRD INSTRUCTIONS
-
-The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
-must be written "shldl %eax,%ebx" for some assemblers. gas takes either,
-Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
-gas), and omits %cl elsewhere.
-
-For GMP an autoconf test is used to determine whether %cl should be used and
-the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass
-through or omit %cl as necessary. See comments with those macros for usage.
-
-
-
-DIRECTION FLAG
-
-The x86 calling conventions say that the direction flag should be clear at
-function entry and exit. (See iBCS2 and SVR4 ABI books, references below.)
-
-Although this has been so since the year dot, it's not absolutely clear
-whether it's universally respected. Since it's better to be safe than
-sorry, gmp follows glibc and does a "cld" if it depends on the direction
-flag being clear. This happens only in a few places.
-
-
-
-POSITION INDEPENDENT CODE
-
-Defining the symbol PIC in m4 processing selects position independent code.
-This mainly affects computed jumps, and these are implemented in a
-self-contained fashion (without using the global offset table). The few
-calls from assembly code to global functions use the normal procedure
-linkage table.
-
-PIC is necessary for ELF shared libraries because they can be mapped into
-different processes at different virtual addresses. Text relocations in
-shared libraries are allowed, but that presumably means a page with such a
-relocation isn't shared. The use of the PLT for PIC adds a fixed cost to
-every function call, which is small but might be noticeable when working with
-small operands.
-
-Calls from one library function to another don't need to go through the PLT,
-since of course the call instruction uses a displacement, not an absolute
-address, and the relative locations of object files are known when libgmp.so
-is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls
-this way, so that there's no jump through the PLT, but of course leaving
-setups of the GOT address in %ebx that may be unnecessary.
-
-The %ebx setup could be avoided in assembly if a separate option controlled
-PIC for calls as opposed to computed jumps etc. But there's only ever
-likely to be a handful of calls out of assembler, and getting the same
-optimization for C intra-library calls would be more important. There seems
-no easy way to tell gcc that certain functions can be called non-PIC, and
-unfortunately many gmp functions use the global memory allocation variables,
-so they need the GOT anyway. Object files with no global data references
-and only intra-library calls could go into the library as non-PIC under
--Bsymbolic. Integrating this into libtool and automake is left as an
-exercise for the reader.
-
-
-
-SIMPLE LOOPS
-
-The overheads in setting up for an unrolled loop can mean that at small
-sizes a simple loop is faster. Making small sizes go fast is important,
-even if it adds a cycle or two to bigger sizes. To this end various
-routines choose between a simple loop and an unrolled loop according to
-operand size. The path to the simple loop, or to special case code for
-small sizes, is always as fast as possible.
-
-Adding a simple loop requires a conditional jump to choose between the
-simple and unrolled code. The size of a branch misprediction penalty
-affects whether a simple loop is worthwhile.
-
-The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
-point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
-UNROLL_THRESHOLD using the unrolled loop. If position independent code adds
-a couple of cycles to an unrolled loop setup, the threshold will vary with
-PIC or non-PIC. Something like the following is typical.
-
- ifdef(`PIC',`
- deflit(UNROLL_THRESHOLD, 10)
- ',`
- deflit(UNROLL_THRESHOLD, 8)
- ')
-
-There's no automated way to determine the threshold. Setting it to a small
-value and then to a big value makes it possible to measure the simple and
-unrolled loops each over a range of sizes, from which the crossover point
-can be determined. Alternately, just adjust the threshold up or down until
-there's no more speedups.
-
-
-
-UNROLLED LOOP CODING
-
-The x86 addressing modes allow a byte displacement of -128 to +127, making
-it possible to access 256 bytes, which is 64 limbs, without adjusting
-pointer registers within the loop. Dword sized displacements can be used
-too, but they increase code size, and unrolling to 64 ought to be enough.
-
-When unrolling to the full 64 limbs/loop, the limb at the top of the loop
-will have a displacement of -128, so pointers have to have a corresponding
-+128 added before entering the loop. When unrolling to 32 limbs/loop
-displacements 0 to 127 can be used with 0 at the top of the loop and no
-adjustment needed to the pointers.
-
-Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
-limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or
-16 is small, so support for 64 limbs/loop is generally only for comparison.
-
-
-
-COMPUTED JUMPS
-
-When working from least significant limb to most significant limb (most
-routines) the computed jump and pointer calculations in preparation for an
-unrolled loop are as follows.
-
- S = operand size in limbs
- N = number of limbs per loop (UNROLL_COUNT)
- L = log2 of unrolling (UNROLL_LOG2)
- M = mask for unrolling (UNROLL_MASK)
- C = code bytes per limb in the loop
- B = bytes per limb (4 for x86)
-
- computed jump (-S & M) * C + entrypoint
- subtract from pointers (-S & M) * B
- initial loop counter (S-1) >> L
- displacements 0 to B*(N-1)
-
-The loop counter is decremented at the end of each loop, and the looping
-stops when the decrement takes the counter to -1. The displacements are for
-the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax".
-
-Usually the multiply by "C" can be handled without an imul, using instead an
-leal, or a shift and subtract.
-
-When working from most significant to least significant limb (eg. mpn_lshift
-and mpn_copyd), the calculations change as follows.
-
- add to pointers (-S & M) * B
- displacements 0 to -B*(N-1)
-
-
-
-OLD GAS 1.92.3
-
-This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
-affect gmp code.
-
-Firstly, an expression involving two forward references to labels comes out
-as zero. For example,
-
- addl $bar-foo, %eax
- foo:
- nop
- bar:
-
-This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
-When only one forward reference is involved, it works correctly, as for
-example,
-
- foo:
- addl $bar-foo, %eax
- nop
- bar:
-
-Secondly, an expression involving two labels can't be used as the
-displacement for an leal. For example,
-
- foo:
- nop
- bar:
- leal bar-foo(%eax,%ebx,8), %ecx
-
-A slightly cryptic error is given, "Unimplemented segment type 0 in
-parse_operand". When only one label is used it's ok, and the label can be a
-forward reference too, as for example,
-
- leal foo(%eax,%ebx,8), %ecx
- nop
- foo:
-
-These problems only affect PIC computed jump calculations. The workarounds
-are just to do an leal without a displacement and then an addl, and to make
-sure the code is placed so that there's at most one forward reference in the
-addl.
-
-
-
-REFERENCES
-
-"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999,
-order numbers 243190, 243191 and 243192. Available on-line,
-
- ftp://download.intel.com/design/PentiumII/manuals/243190.htm
- ftp://download.intel.com/design/PentiumII/manuals/243191.htm
- ftp://download.intel.com/design/PentiumII/manuals/243192.htm
-
-"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
-published by McGraw-Hill, 1991, ISBN 0-07-031219-2.
-
-"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
-published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor
-Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF
-shared library PIC coding.)
-
-
-
-----------------
-Local variables:
-mode: text
-fill-column: 76
-End:
diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S
deleted file mode 100644
index fe6f648f53..0000000000
--- a/rts/gmp/mpn/x86/addsub_n.S
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Currently not working and not used. */
-
-/*
-Copyright (C) 1999 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA.
-*/
-
-
-#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r
-#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r
-
- .globl mpn_addsub_n_0
- .globl mpn_addsub_n_1
-
-/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1.
- We let subtraction and addition alternate in being two limbs
- ahead of the other, thereby avoiding some SAVE_RESTORE. */
-// r1 = r2 + r1 edi = esi + edi
-// r2 = r2 - r1 esi = esi - edi
-// s1 s2
-// r2 r1
-// eax,ebx,ecx,edx,esi,edi,ebp
-mpn_addsub_n_0:
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-
- movl 20(%esp),%edi /* res_ptr */
- movl 24(%esp),%esi /* s1_ptr */
- movl 36(%esp),%ebp /* size */
-
- shrl $2,%ebp
- xorl %edx,%edx
- .align 4
-Loop0: // L=load E=execute S=store
- movl (%esi),%ebx // sub 0 L
- movl 4(%esi),%ecx // sub 1 L
- sbbl (%edi),%ebx // sub 0 LE
- sbbl 4(%edi),%ecx // sub 1 LE
-// SAVE_BORROW_RESTORE_CARRY(%edx)
- movl (%esi),%eax // add 0 L
- adcl %eax,(%edi) // add 0 LES
- movl 4(%esi),%eax // add 1 L
- adcl %eax,4(%edi) // add 1 LES
- movl %ebx,(%esi) // sub 0 S
- movl %ecx,4(%esi) // sub 1 S
- movl 8(%esi),%ebx // add 2 L
- adcl 8(%edi),%ebx // add 2 LE
- movl 12(%esi),%ecx // add 3 L
- adcl 12(%edi),%ecx // add 3 LE
-// SAVE_CARRY_RESTORE_BORROW(%edx)
- movl 8(%edi),%eax // sub 2 L
- sbbl %eax,8(%esi) // sub 2 LES
- movl 12(%edi),%eax // sub 3 L
- sbbl %eax,12(%esi) // sub 3 LES
- movl %ebx,8(%edi) // add 2 S
- movl %ecx,12(%edi) // add 3 S
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ebp
- jnz Loop0
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2.
- We let subtraction and addition alternate in being two limbs
- ahead of the other, thereby avoiding some SAVE_RESTORE. */
-// r1 = r1 + r2 edi = edi + esi
-// r2 = r1 - r2 esi = edi - esi
-// s2 s1
-// r2 r1
-// eax,ebx,ecx,edx,esi,edi,ebp
-mpn_addsub_n_1:
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-
- movl 20(%esp),%edi /* res_ptr */
- movl 24(%esp),%esi /* s1_ptr */
- movl 36(%esp),%ebp /* size */
-
- shrl $2,%ebp
- xorl %edx,%edx
- .align 4
-Loop1: // L=load E=execute S=store
- movl (%edi),%ebx // sub 0 L
- sbbl (%esi),%ebx // sub 0 LE
- movl 4(%edi),%ecx // sub 1 L
- sbbl 4(%esi),%ecx // sub 1 LE
-// SAVE_BORROW_RESTORE_CARRY(%edx)
- movl (%esi),%eax // add 0 L
- adcl %eax,(%edi) // add 0 LES
- movl 4(%esi),%eax // add 1 L
- adcl %eax,4(%edi) // add 1 LES
- movl %ebx,(%esi) // sub 0 S
- movl %ecx,4(%esi) // sub 1 S
- movl 8(%esi),%ebx // add 2 L
- adcl 8(%edi),%ebx // add 2 LE
- movl 12(%esi),%ecx // add 3 L
- adcl 12(%edi),%ecx // add 3 LE
-// SAVE_CARRY_RESTORE_BORROW(%edx)
- movl 8(%edi),%eax // sub 2 L
- sbbl 8(%esi),%eax // sub 2 LES
- movl %eax,8(%esi) // sub 2 S
- movl 12(%edi),%eax // sub 3 L
- sbbl 12(%esi),%eax // sub 3 LE
- movl %eax,12(%esi) // sub 3 S
- movl %ebx,8(%edi) // add 2 S
- movl %ecx,12(%edi) // add 3 S
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ebp
- jnz Loop1
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
- .globl mpn_copy
-mpn_copy:
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-
- movl 20(%esp),%edi /* res_ptr */
- movl 24(%esp),%esi /* s1_ptr */
- movl 28(%esp),%ebp /* size */
-
- shrl $2,%ebp
- .align 4
-Loop2:
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl %eax,(%edi)
- movl %ebx,4(%edi)
- movl 8(%esi),%eax
- movl 12(%esi),%ebx
- movl %eax,8(%edi)
- movl %ebx,12(%edi)
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ebp
- jnz Loop2
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm
deleted file mode 100644
index 18ef816b4d..0000000000
--- a/rts/gmp/mpn/x86/aors_n.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-
-dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
-dnl Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-ifdef(`OPERATION_add_n',`
- define(M4_inst, adcl)
- define(M4_function_n, mpn_add_n)
- define(M4_function_nc, mpn_add_nc)
-
-',`ifdef(`OPERATION_sub_n',`
- define(M4_inst, sbbl)
- define(M4_function_n, mpn_sub_n)
- define(M4_function_nc, mpn_sub_nc)
-
-',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-
-C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC2, 12)
-defframe(PARAM_SRC1, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-
-PROLOGUE(M4_function_nc)
-deflit(`FRAME',0)
-
- pushl %edi FRAME_pushl()
- pushl %esi FRAME_pushl()
-
- movl PARAM_DST,%edi
- movl PARAM_SRC1,%esi
- movl PARAM_SRC2,%edx
- movl PARAM_SIZE,%ecx
-
- movl %ecx,%eax
- shrl $3,%ecx C compute count for unrolled loop
- negl %eax
- andl $7,%eax C get index where to start loop
- jz LF(M4_function_n,oopgo) C necessary special case for 0
- incl %ecx C adjust loop count
- shll $2,%eax C adjustment for pointers...
- subl %eax,%edi C ... since they are offset ...
- subl %eax,%esi C ... by a constant when we ...
- subl %eax,%edx C ... enter the loop
- shrl $2,%eax C restore previous value
-
-ifdef(`PIC',`
- C Calculate start address in loop for PIC. Due to limitations in
- C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
- call L(0a)
-L(0a): leal (%eax,%eax,8),%eax
- addl (%esp),%eax
- addl $LF(M4_function_n,oop)-L(0a)-3,%eax
- addl $4,%esp
-',`
- C Calculate start address in loop for non-PIC.
- leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax
-')
-
- C These lines initialize carry from the 5th parameter. Should be
- C possible to simplify.
- pushl %ebp FRAME_pushl()
- movl PARAM_CARRY,%ebp
- shrl $1,%ebp C shift bit 0 into carry
- popl %ebp FRAME_popl()
-
- jmp *%eax C jump into loop
-
-EPILOGUE()
-
-
- ALIGN(8)
-PROLOGUE(M4_function_n)
-deflit(`FRAME',0)
-
- pushl %edi FRAME_pushl()
- pushl %esi FRAME_pushl()
-
- movl PARAM_DST,%edi
- movl PARAM_SRC1,%esi
- movl PARAM_SRC2,%edx
- movl PARAM_SIZE,%ecx
-
- movl %ecx,%eax
- shrl $3,%ecx C compute count for unrolled loop
- negl %eax
- andl $7,%eax C get index where to start loop
- jz L(oop) C necessary special case for 0
- incl %ecx C adjust loop count
- shll $2,%eax C adjustment for pointers...
- subl %eax,%edi C ... since they are offset ...
- subl %eax,%esi C ... by a constant when we ...
- subl %eax,%edx C ... enter the loop
- shrl $2,%eax C restore previous value
-
-ifdef(`PIC',`
- C Calculate start address in loop for PIC. Due to limitations in
- C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
- call L(0b)
-L(0b): leal (%eax,%eax,8),%eax
- addl (%esp),%eax
- addl $L(oop)-L(0b)-3,%eax
- addl $4,%esp
-',`
- C Calculate start address in loop for non-PIC.
- leal L(oop)-3(%eax,%eax,8),%eax
-')
- jmp *%eax C jump into loop
-
-L(oopgo):
- pushl %ebp FRAME_pushl()
- movl PARAM_CARRY,%ebp
- shrl $1,%ebp C shift bit 0 into carry
- popl %ebp FRAME_popl()
-
- ALIGN(8)
-L(oop): movl (%esi),%eax
- M4_inst (%edx),%eax
- movl %eax,(%edi)
- movl 4(%esi),%eax
- M4_inst 4(%edx),%eax
- movl %eax,4(%edi)
- movl 8(%esi),%eax
- M4_inst 8(%edx),%eax
- movl %eax,8(%edi)
- movl 12(%esi),%eax
- M4_inst 12(%edx),%eax
- movl %eax,12(%edi)
- movl 16(%esi),%eax
- M4_inst 16(%edx),%eax
- movl %eax,16(%edi)
- movl 20(%esi),%eax
- M4_inst 20(%edx),%eax
- movl %eax,20(%edi)
- movl 24(%esi),%eax
- M4_inst 24(%edx),%eax
- movl %eax,24(%edi)
- movl 28(%esi),%eax
- M4_inst 28(%edx),%eax
- movl %eax,28(%edi)
- leal 32(%edi),%edi
- leal 32(%esi),%esi
- leal 32(%edx),%edx
- decl %ecx
- jnz L(oop)
-
- sbbl %eax,%eax
- negl %eax
-
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm
deleted file mode 100644
index f32ad83989..0000000000
--- a/rts/gmp/mpn/x86/aorsmul_1.asm
+++ /dev/null
@@ -1,134 +0,0 @@
-dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
-dnl limb and add the result to a second limb vector.
-
-
-dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-ifdef(`OPERATION_addmul_1',`
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
-
-',`ifdef(`OPERATION_submul_1',`
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
-
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-
-define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
-define(PARAM_SIZE, `FRAME+12(%esp)')
-define(PARAM_SRC, `FRAME+8(%esp)')
-define(PARAM_DST, `FRAME+4(%esp)')
-
- TEXT
- ALIGN(8)
-
-PROLOGUE(M4_function_1)
-deflit(`FRAME',0)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%ecx
-
- xorl %ebx,%ebx
- andl $3,%ecx
- jz L(end0)
-
-L(oop0):
- movl (%esi),%eax
- mull PARAM_MULTIPLIER
- leal 4(%esi),%esi
- addl %ebx,%eax
- movl $0,%ebx
- adcl %ebx,%edx
- M4_inst %eax,(%edi)
- adcl %edx,%ebx C propagate carry into cylimb
-
- leal 4(%edi),%edi
- decl %ecx
- jnz L(oop0)
-
-L(end0):
- movl PARAM_SIZE,%ecx
- shrl $2,%ecx
- jz L(end)
-
- ALIGN(8)
-L(oop): movl (%esi),%eax
- mull PARAM_MULTIPLIER
- addl %eax,%ebx
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 4(%esi),%eax
- mull PARAM_MULTIPLIER
- M4_inst %ebx,(%edi)
- adcl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- movl 8(%esi),%eax
- mull PARAM_MULTIPLIER
- M4_inst %ebp,4(%edi)
- adcl %eax,%ebx C new lo + cylimb
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 12(%esi),%eax
- mull PARAM_MULTIPLIER
- M4_inst %ebx,8(%edi)
- adcl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- M4_inst %ebp,12(%edi)
- adcl $0,%ebx C propagate carry into cylimb
-
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ecx
- jnz L(oop)
-
-L(end): movl %ebx,%eax
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm
deleted file mode 100644
index 439640e836..0000000000
--- a/rts/gmp/mpn/x86/copyd.asm
+++ /dev/null
@@ -1,80 +0,0 @@
-dnl x86 mpn_copyd -- copy limb vector, decrementing.
-dnl
-dnl Future: On P6 an MMX loop should be able to go faster than this code.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Copy src,size to dst,size, working from high to low addresses.
-C
-C The code here is very generic and can be expected to be reasonable on all
-C the x86 family.
-C
-C P5 - 1.0 cycles/limb.
-C
-C P6 - 2.4 cycles/limb, approx 40 cycles startup.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_copyd)
- C eax saved esi
- C ebx
- C ecx counter
- C edx saved edi
- C esi src
- C edi dst
- C ebp
-
- movl PARAM_SIZE, %ecx
- movl %esi, %eax
-
- movl PARAM_SRC, %esi
- movl %edi, %edx
-
- movl PARAM_DST, %edi
- leal -4(%esi,%ecx,4), %esi
-
- leal -4(%edi,%ecx,4), %edi
-
- std
-
- rep
- movsl
-
- cld
-
- movl %eax, %esi
- movl %edx, %edi
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm
deleted file mode 100644
index 5bc4e36689..0000000000
--- a/rts/gmp/mpn/x86/copyi.asm
+++ /dev/null
@@ -1,79 +0,0 @@
-dnl x86 mpn_copyi -- copy limb vector, incrementing.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Copy src,size to dst,size, working from low to high addresses.
-C
-C The code here is very generic and can be expected to be reasonable on all
-C the x86 family.
-C
-C P5 - 1.0 cycles/limb.
-C
-C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be
-C slower than a rep movs in all cases. The fastest MMX found was 0.8
-C cycles/limb (when fully aligned). A rep movs seems to have a startup
-C time of about 15 cycles, but doing something special for small sizes
-C could lead to a branch misprediction that would destroy any saving.
-C For now a plain rep movs seems ok for P6.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-
- C eax saved esi
- C ebx
- C ecx counter
- C edx saved edi
- C esi src
- C edi dst
- C ebp
-
-PROLOGUE(mpn_copyi)
-
- movl PARAM_SIZE, %ecx
- movl %esi, %eax
-
- movl PARAM_SRC, %esi
- movl %edi, %edx
-
- movl PARAM_DST, %edi
-
- cld C better safe than sorry, see mpn/x86/README.family
-
- rep
- movsl
-
- movl %eax, %esi
- movl %edx, %edi
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm
deleted file mode 100644
index df879da9e1..0000000000
--- a/rts/gmp/mpn/x86/diveby3.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl The following all have their own optimized versions of this routine,
-dnl but for reference the code here runs as follows.
-dnl
-dnl cycles/limb
-dnl P54 18.0
-dnl P55 17.0
-dnl P6 14.5
-dnl K6 14.0
-dnl K7 10.0
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t carry);
-
-defframe(PARAM_CARRY,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl multiplicative inverse of 3, modulo 2^32
-deflit(INVERSE_3, 0xAAAAAAAB)
-
-dnl ceil(b/3) and ceil(b*2/3) where b=2^32
-deflit(ONE_THIRD_CEIL, 0x55555556)
-deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
-
- .text
- ALIGN(8)
-
-PROLOGUE(mpn_divexact_by3c)
-deflit(`FRAME',0)
-
- movl PARAM_SRC, %ecx
- pushl %ebp FRAME_pushl()
-
- movl PARAM_SIZE, %ebp
- pushl %edi FRAME_pushl()
-
- movl PARAM_DST, %edi
- pushl %esi FRAME_pushl()
-
- movl $INVERSE_3, %esi
- pushl %ebx FRAME_pushl()
-
- leal (%ecx,%ebp,4), %ecx
- movl PARAM_CARRY, %ebx
-
- leal (%edi,%ebp,4), %edi
- negl %ebp
-
-
- ALIGN(8)
-L(top):
- C eax scratch, low product
- C ebx carry limb (0 to 3)
- C ecx &src[size]
- C edx scratch, high product
- C esi multiplier
- C edi &dst[size]
- C ebp counter, limbs, negative
-
- movl (%ecx,%ebp,4), %eax
-
- subl %ebx, %eax
-
- setc %bl
-
- imull %esi
-
- cmpl $ONE_THIRD_CEIL, %eax
- movl %eax, (%edi,%ebp,4)
-
- sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
- cmpl $TWO_THIRDS_CEIL, %eax
-
- sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3)
- incl %ebp
-
- jnz L(top)
-
-
- movl %ebx, %eax
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm
deleted file mode 100644
index 12f14676d6..0000000000
--- a/rts/gmp/mpn/x86/divrem_1.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl cycles/limb
-dnl K6 20
-dnl P5 44
-dnl P6 39
-dnl 486 approx 43 maybe
-dnl
-dnl
-dnl The following have their own optimized divrem_1 implementations, but
-dnl for reference the code here runs as follows.
-dnl
-dnl cycles/limb
-dnl P6MMX 39
-dnl K7 42
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
-C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
-C
-C Divide src,size by divisor and store the quotient in dst+xsize,size.
-C Extend the division to fractional quotient limbs in dst,xsize. Return the
-C remainder. Either or both xsize and size can be 0.
-C
-C mpn_divrem_1c takes a carry parameter which is an initial high limb,
-C effectively one extra limb at the top of src,size. Must have
-C carry<divisor.
-C
-C
-C Essentially the code is the same as the division based part of
-C mpn/generic/divrem_1.c, but has the following advantages.
-C
-C - If gcc isn't being used then divrem_1.c will get the generic C
-C udiv_qrnnd() and be rather slow.
-C
-C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
-C generate that instruction (as of gcc 2.95.2 at least).
-C
-C A test is done to see if the high limb is less than the divisor, and if so
-C one less div is done. A div is between 20 and 40 cycles on the various
-C x86s, so assuming high<divisor about half the time, then this test saves
-C half that amount. The branch misprediction penalty on each chip is less
-C than half a div.
-C
-C
-C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
-C here, so it seems there's nothing to gain by rearranging the loop.
-C Pairing the mov and loop instructions was found to gain nothing. (The
-C same is true of the mpn/x86/mod_1.asm loop.)
-C
-C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
-C The loop_or_decljnz macro is an easy way to get a 10% speedup.
-C
-C The fast K6 multiply might be thought to suit a multiply-by-inverse,
-C but that algorithm has been found to suffer from the relatively poor
-C carry handling on K6 and too many auxiliary instructions. The
-C fractional part however could be done at about 13 c/l.
-C
-C P5: Moving the load down to pair with the store might save 1 cycle, but
-C that doesn't seem worth bothering with, since it'd be only a 2.2%
-C saving.
-C
-C Again here the auxiliary instructions hinder a multiply-by-inverse,
-C though there might be a 10-15% speedup available
-
-
-defframe(PARAM_CARRY, 24)
-defframe(PARAM_DIVISOR,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC, 12)
-defframe(PARAM_XSIZE, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(16)
-
-PROLOGUE(mpn_divrem_1c)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- pushl %edi FRAME_pushl()
-
- movl PARAM_SRC, %edi
- pushl %esi FRAME_pushl()
-
- movl PARAM_DIVISOR, %esi
- pushl %ebx FRAME_pushl()
-
- movl PARAM_DST, %ebx
- pushl %ebp FRAME_pushl()
-
- movl PARAM_XSIZE, %ebp
- orl %ecx, %ecx
-
- movl PARAM_CARRY, %edx
- jz LF(mpn_divrem_1,fraction)
-
- leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
- jmp LF(mpn_divrem_1,integer_top)
-
-EPILOGUE()
-
-
-PROLOGUE(mpn_divrem_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- pushl %edi FRAME_pushl()
-
- movl PARAM_SRC, %edi
- pushl %esi FRAME_pushl()
-
- movl PARAM_DIVISOR, %esi
- orl %ecx,%ecx
-
- jz L(size_zero)
- pushl %ebx FRAME_pushl()
-
- movl -4(%edi,%ecx,4), %eax C src high limb
- xorl %edx, %edx
-
- movl PARAM_DST, %ebx
- pushl %ebp FRAME_pushl()
-
- movl PARAM_XSIZE, %ebp
- cmpl %esi, %eax
-
- leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
- jae L(integer_entry)
-
-
- C high<divisor, so high of dst is zero, and avoid one div
-
- movl %edx, (%ebx,%ecx,4)
- decl %ecx
-
- movl %eax, %edx
- jz L(fraction)
-
-
-L(integer_top):
- C eax scratch (quotient)
- C ebx dst+4*xsize-4
- C ecx counter
- C edx scratch (remainder)
- C esi divisor
- C edi src
- C ebp xsize
-
- movl -4(%edi,%ecx,4), %eax
-L(integer_entry):
-
- divl %esi
-
- movl %eax, (%ebx,%ecx,4)
- loop_or_decljnz L(integer_top)
-
-
-L(fraction):
- orl %ebp, %ecx
- jz L(done)
-
- movl PARAM_DST, %ebx
-
-
-L(fraction_top):
- C eax scratch (quotient)
- C ebx dst
- C ecx counter
- C edx scratch (remainder)
- C esi divisor
- C edi
- C ebp
-
- xorl %eax, %eax
-
- divl %esi
-
- movl %eax, -4(%ebx,%ecx,4)
- loop_or_decljnz L(fraction_top)
-
-
-L(done):
- popl %ebp
- movl %edx, %eax
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-
-L(size_zero):
-deflit(`FRAME',8)
- movl PARAM_XSIZE, %ecx
- xorl %eax, %eax
-
- movl PARAM_DST, %edi
-
- cld C better safe than sorry, see mpn/x86/README.family
-
- rep
- stosl
-
- popl %esi
- popl %edi
- ret
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README
deleted file mode 100644
index 3ad96c8b89..0000000000
--- a/rts/gmp/mpn/x86/k6/README
+++ /dev/null
@@ -1,237 +0,0 @@
-
- AMD K6 MPN SUBROUTINES
-
-
-
-This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
-K6-3.
-
-The mmx and k62mmx subdirectories have routines using MMX instructions. All
-K6s have MMX, the separate directories are just so that ./configure can omit
-them if the assembler doesn't support MMX.
-
-
-
-
-STATUS
-
-Times for the loops, with all code and data in L1 cache, are as follows.
-
- cycles/limb
-
- mpn_add_n/sub_n 3.25 normal, 2.75 in-place
-
- mpn_mul_1 6.25
- mpn_add/submul_1 7.65-8.4 (varying with data values)
-
- mpn_mul_basecase 9.25 cycles/crossproduct (approx)
- mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
- or 9.2 cycles/triangleproduct (approx)
-
- mpn_divrem_1 20.0
- mpn_mod_1 20.0
- mpn_divexact_by3 11.0
-
- mpn_l/rshift 3.0
-
- mpn_copyi/copyd 1.0
-
- mpn_com_n 1.5-1.85 \
- mpn_and/andn/ior/xor_n 1.5-1.75 | varying with
- mpn_iorn/xnor_n 2.0-2.25 | data alignment
- mpn_nand/nior_n 2.0-2.25 /
-
- mpn_popcount 12.5
- mpn_hamdist 13.0
-
-
-K6-2 and K6-3 have dual-issue MMX and get the following improvements.
-
- mpn_l/rshift 1.75
-
- mpn_copyi/copyd 0.56 or 1.0 \
- |
- mpn_com_n 1.0-1.2 | varying with
- mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment
- mpn_iorn/xnor_n 1.5-2.0 |
- mpn_nand/nior_n 1.75-2.0 /
-
- mpn_popcount 9.0
- mpn_hamdist 11.5
-
-
-Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
-instruction, code seems to run slower, and with just "mov" loads it doesn't
-seem faster. Results so far are inconsistent. The K6 does a hardware
-prefetch of the second cache line in a sector, so the penalty for not
-prefetching in software is reduced.
-
-
-
-
-NOTES
-
-All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
-
-Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
-execute them in both X and Y (and together).
-
-Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
-chapter 6 table 12).
-
-Write-allocate L1 data cache means prefetching of destinations is unnecessary.
-Store queue is 7 entries of 64 bits each.
-
-Floating point multiplications can be done in parallel with integer
-multiplications, but there doesn't seem to be any way to make use of this.
-
-
-
-OPTIMIZATIONS
-
-Unrolled loops are used to reduce looping overhead. The unrolling is
-configurable up to 32 limbs/loop for most routines, up to 64 for some.
-
-Sometimes computed jumps into the unrolling are used to handle sizes not a
-multiple of the unrolling. An attractive feature of this is that times
-smoothly increase with operand size, but an indirect jump is about 6 cycles
-and the setups about another 6, so it depends on how much the unrolled code
-is faster than a simple loop as to whether a computed jump ought to be used.
-
-Position independent code is implemented using a call to get eip for
-computed jumps and a ret is always done, rather than an addl $4,%esp or a
-popl, so the CPU return address branch prediction stack stays synchronised
-with the actual stack in memory. Such a call however still costs 4 to 7
-cycles.
-
-Branch prediction, in absence of any history, will guess forward jumps are
-not taken and backward jumps are taken. Where possible it's arranged that
-the less likely or less important case is under a taken forward jump.
-
-
-
-MMX
-
-Putting emms or femms as late as possible in a routine seems to be fastest.
-Perhaps an emms or femms stalls until all outstanding MMX instructions have
-completed, so putting it later gives them a chance to complete on their own,
-in parallel with other operations (like register popping).
-
-The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
-at the start of a routine, in case it's been preceded by x87 floating point
-operations. This isn't done because in gmp programs it's expected that x87
-floating point won't be much used and that chances are an mpn routine won't
-have been preceded by any x87 code.
-
-
-
-CODING
-
-Instructions in general code are shown paired if they can decode and execute
-together, meaning two short decode instructions with the second not
-depending on the first, only the first using the shifter, no more than one
-load, and no more than one store.
-
-K6 does some out of order execution so the pairings aren't essential, they
-just show what slots might be available. When decoding is the limiting
-factor things can be scheduled that might not execute until later.
-
-
-
-NOTES
-
-Code alignment
-
-- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
- short decode is inhibited. The cross.pl script detects this.
-
-- loops and branch targets should be aligned to 16 bytes, or ensure at least
- 2 instructions before a 32 byte boundary. This makes use of the 16 byte
- cache in the BTB.
-
-Addressing modes
-
-- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
- problem, and can be used as an equivalent, or easier is just to use a
- different register, like %ebx.
-
-- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
- have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
-
- If more than 3 bytes are needed to determine instruction length then
- decoding degrades from direct to long, or from long to vector. This
- happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
- with mod=00 the sib determines whether there's a displacement.
-
- This affects all MMX and 3DNow instructions, and others with an 0F prefix
- like movzbl. The modes affected are anything with an index and no
- displacement, or an index but no base, and this includes (%esp) which is
- really (,%esp,1).
-
- The cross.pl script detects problem cases. The workaround is to always
- use a displacement, and to do this with Zdisp if it's zero so the
- assembler doesn't discard it.
-
- See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
- 13-14 and 36-37.
-
-Calls
-
-- indirect jumps and calls are not branch predicted, they measure about 6
- cycles.
-
-Various
-
-- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
-- bsf 12-27 cycles
-- emms 5 cycles
-- femms 3 cycles
-- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
-- divl 20 cycles back-to-back
-- imull 2 decode, 2 execute
-- mull 2 decode, 3 execute (optimization manual decoding sample)
-- prefetch 2 cycles
-- rcll/rcrl implicit by one bit: 2 cycles
- immediate or %cl count: 11 + 2 per bit for dword
- 13 + 4 per bit for byte
-- setCC 2 cycles
-- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
- reg,reg 2 cycles, back-to-back
-
-
-
-
-REFERENCES
-
-"AMD-K6 Processor Code Optimization Application Note", AMD publication
-number 21924, revision D amendment 0, January 2000. This describes K6-2 and
-K6-3. Available on-line,
-
- http://www.amd.com/K6/k6docs/pdf/21924.pdf
-
-"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
-publication number 21828, revision A amendment 0, August 1997. This is an
-older edition of the above document, describing plain K6. Available
-on-line,
-
- http://www.amd.com/K6/k6docs/pdf/21828.pdf
-
-"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
-This describes the femms and prefetch instructions, but nothing else from
-3DNow has been used. Available on-line,
-
- http://www.amd.com/K6/k6docs/pdf/21928.pdf
-
-"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
-August 1999. This has some notes on general K6 optimizations as well as
-3DNow. Available on-line,
-
- http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
-
-
-
-----------------
-Local variables:
-mode: text
-fill-column: 76
-End:
diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm
deleted file mode 100644
index 31b05ada51..0000000000
--- a/rts/gmp/mpn/x86/k6/aors_n.asm
+++ /dev/null
@@ -1,329 +0,0 @@
-dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
-dnl
-dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-ifdef(`OPERATION_add_n', `
- define(M4_inst, adcl)
- define(M4_function_n, mpn_add_n)
- define(M4_function_nc, mpn_add_nc)
- define(M4_description, add)
-',`ifdef(`OPERATION_sub_n', `
- define(M4_inst, sbbl)
- define(M4_function_n, mpn_sub_n)
- define(M4_function_nc, mpn_sub_nc)
- define(M4_description, subtract)
-',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-
-C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-C
-C Calculate src1,size M4_description src2,size, and store the result in
-C dst,size. The return value is the carry bit from the top of the result
-C (1 or 0).
-C
-C The _nc version accepts 1 or 0 for an initial carry into the low limb of
-C the calculation. Note values other than 1 or 0 here will lead to garbage
-C results.
-C
-C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
-C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
-C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
-
-define(PARAM_CARRY, `FRAME+20(%esp)')
-define(PARAM_SIZE, `FRAME+16(%esp)')
-define(PARAM_SRC2, `FRAME+12(%esp)')
-define(PARAM_SRC1, `FRAME+8(%esp)')
-define(PARAM_DST, `FRAME+4(%esp)')
-deflit(`FRAME',0)
-
-dnl minimum 5 because the unrolled code can't handle less
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(32)
-
-PROLOGUE(M4_function_nc)
- movl PARAM_CARRY, %eax
- jmp LF(M4_function_n,start)
-EPILOGUE()
-
-
-PROLOGUE(M4_function_n)
- xorl %eax, %eax
-L(start):
- movl PARAM_SIZE, %ecx
- pushl %ebx
-FRAME_pushl()
-
- movl PARAM_SRC1, %ebx
- pushl %edi
-FRAME_pushl()
-
- movl PARAM_SRC2, %edx
- cmpl $UNROLL_THRESHOLD, %ecx
-
- movl PARAM_DST, %edi
- jae L(unroll)
-
-
- shrl %eax C initial carry flag
-
- C offset 0x21 here, close enough to aligned
-L(simple):
- C eax scratch
- C ebx src1
- C ecx counter
- C edx src2
- C esi
- C edi dst
- C ebp
- C
- C The store to (%edi) could be done with a stosl; it'd be smaller
- C code, but there's no speed gain and a cld would have to be added
- C (per mpn/x86/README.family).
-
- movl (%ebx), %eax
- leal 4(%ebx), %ebx
-
- M4_inst (%edx), %eax
-
- movl %eax, (%edi)
- leal 4(%edi), %edi
-
- leal 4(%edx), %edx
- loop L(simple)
-
-
- movl $0, %eax
- popl %edi
-
- setc %al
-
- popl %ebx
- ret
-
-
-C -----------------------------------------------------------------------------
-L(unroll):
- C eax carry
- C ebx src1
- C ecx counter
- C edx src2
- C esi
- C edi dst
- C ebp
-
- cmpl %edi, %ebx
- pushl %esi
-
- je L(inplace)
-
-ifdef(`OPERATION_add_n',`
- cmpl %edi, %edx
-
- je L(inplace_reverse)
-')
-
- movl %ecx, %esi
-
- andl $-4, %ecx
- andl $3, %esi
-
- leal (%ebx,%ecx,4), %ebx
- leal (%edx,%ecx,4), %edx
- leal (%edi,%ecx,4), %edi
-
- negl %ecx
- shrl %eax
-
- ALIGN(32)
-L(normal_top):
- C eax counter, qwords, negative
- C ebx src1
- C ecx scratch
- C edx src2
- C esi
- C edi dst
- C ebp
-
- movl (%ebx,%ecx,4), %eax
- leal 5(%ecx), %ecx
- M4_inst -20(%edx,%ecx,4), %eax
- movl %eax, -20(%edi,%ecx,4)
-
- movl 4-20(%ebx,%ecx,4), %eax
- M4_inst 4-20(%edx,%ecx,4), %eax
- movl %eax, 4-20(%edi,%ecx,4)
-
- movl 8-20(%ebx,%ecx,4), %eax
- M4_inst 8-20(%edx,%ecx,4), %eax
- movl %eax, 8-20(%edi,%ecx,4)
-
- movl 12-20(%ebx,%ecx,4), %eax
- M4_inst 12-20(%edx,%ecx,4), %eax
- movl %eax, 12-20(%edi,%ecx,4)
-
- loop L(normal_top)
-
-
- decl %esi
- jz L(normal_finish_one)
- js L(normal_done)
-
- C two or three more limbs
-
- movl (%ebx), %eax
- M4_inst (%edx), %eax
- movl %eax, (%edi)
-
- movl 4(%ebx), %eax
- M4_inst 4(%edx), %eax
- decl %esi
- movl %eax, 4(%edi)
-
- jz L(normal_done)
- movl $2, %ecx
-
-L(normal_finish_one):
- movl (%ebx,%ecx,4), %eax
- M4_inst (%edx,%ecx,4), %eax
- movl %eax, (%edi,%ecx,4)
-
-L(normal_done):
- popl %esi
- popl %edi
-
- movl $0, %eax
- popl %ebx
-
- setc %al
-
- ret
-
-
-C -----------------------------------------------------------------------------
-
-ifdef(`OPERATION_add_n',`
-L(inplace_reverse):
- C dst==src2
-
- movl %ebx, %edx
-')
-
-L(inplace):
- C eax initial carry
- C ebx
- C ecx size
- C edx src
- C esi
- C edi dst
- C ebp
-
- leal -1(%ecx), %esi
- decl %ecx
-
- andl $-4, %ecx
- andl $3, %esi
-
- movl (%edx), %ebx C src low limb
- leal (%edx,%ecx,4), %edx
-
- leal (%edi,%ecx,4), %edi
- negl %ecx
-
- shrl %eax
-
-
- ALIGN(32)
-L(inplace_top):
- C eax
- C ebx next src limb
- C ecx size
- C edx src
- C esi
- C edi dst
- C ebp
-
- M4_inst %ebx, (%edi,%ecx,4)
-
- movl 4(%edx,%ecx,4), %eax
- leal 5(%ecx), %ecx
-
- M4_inst %eax, 4-20(%edi,%ecx,4)
-
- movl 8-20(%edx,%ecx,4), %eax
- movl 12-20(%edx,%ecx,4), %ebx
-
- M4_inst %eax, 8-20(%edi,%ecx,4)
- M4_inst %ebx, 12-20(%edi,%ecx,4)
-
- movl 16-20(%edx,%ecx,4), %ebx
- loop L(inplace_top)
-
-
- C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
-
- M4_inst %ebx, (%edi)
-
- decl %esi
- jz L(inplace_finish_one)
- js L(inplace_done)
-
- C two or three more limbs
-
- movl 4(%edx), %eax
- movl 8(%edx), %ebx
- M4_inst %eax, 4(%edi)
- M4_inst %ebx, 8(%edi)
-
- decl %esi
- movl $2, %ecx
-
- jz L(normal_done)
-
-L(inplace_finish_one):
- movl 4(%edx,%ecx,4), %eax
- M4_inst %eax, 4(%edi,%ecx,4)
-
-L(inplace_done):
- popl %esi
- popl %edi
-
- movl $0, %eax
- popl %ebx
-
- setc %al
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
deleted file mode 100644
index da4120fe2f..0000000000
--- a/rts/gmp/mpn/x86/k6/aorsmul_1.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl
-dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
-dnl PIC adds about 6 cycles at the start.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K6: large multipliers small multipliers
-dnl UNROLL_COUNT cycles/limb cycles/limb
-dnl 4 9.5 7.78
-dnl 8 9.0 7.78
-dnl 16 8.4 7.65
-dnl 32 8.4 8.2
-dnl
-dnl Maximum possible unrolling with the current code is 32.
-dnl
-dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
-dnl byte block, which might explain the good speed at that unrolling.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_addmul_1', `
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
- define(M4_function_1c, mpn_addmul_1c)
- define(M4_description, add it to)
- define(M4_desc_retval, carry)
-',`ifdef(`OPERATION_submul_1', `
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
- define(M4_function_1c, mpn_submul_1c)
- define(M4_description, subtract it from)
- define(M4_desc_retval, borrow)
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult, mp_limb_t carry);
-C
-C Calculate src,size multiplied by mult and M4_description dst,size.
-C Return the M4_desc_retval limb from the top of the result.
-C
-C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
-C multipliers (most significant few bits clear) result in few carry bits and
-C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
-C significant few bits set) make the carry bits 50/50 and lead to something
-C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
-C
-C It's important that the gains for jadcl0 on small multipliers don't come
-C at the cost of slowing down other data. Tests on uniformly distributed
-C random data, designed to confound branch prediction, show about a 7%
-C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
-C overheads included).
-C
-C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
-C 11.0 cycles/limb), and hence isn't used.
-C
-C In the simple loop, note that running ecx from negative to zero and using
-C it as an index in the two movs wouldn't help. It would save one
-C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
-C that would be collapsed by this.
-C
-C
-C jadcl0
-C ------
-C
-C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
-C firstly the instruction decoding and secondly the fact that there's a
-C carry bit for the jadcl0 only on average about 1/4 of the time.
-C
-C The code in the unrolled loop decodes something like the following.
-C
-C decode cycles
-C mull %ebp 2
-C M4_inst %esi, disp(%edi) 1
-C adcl %eax, %ecx 2
-C movl %edx, %esi \ 1
-C jnc 1f /
-C incl %esi \ 1
-C 1: movl disp(%ebx), %eax /
-C ---
-C 7
-C
-C In a back-to-back style test this measures 7 with the jnc not taken, or 8
-C with it taken (both when correctly predicted). This is opposite to the
-C measurements showing small multipliers running faster than large ones.
-C Watch this space for more info ...
-C
-C It's not clear how much branch misprediction might be costing. The K6
-C doco says it will be 1 to 4 cycles, but presumably it's near the low end
-C of that range to get the measured results.
-C
-C
-C In the code the two carries are more or less the preceding mul product and
-C the calculation is roughly
-C
-C x*y + u*b+v
-C
-C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
-C v are the two limbs it's added to (being the low of the next mul, and a
-C limb from the destination).
-C
-C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
-C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
-C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
-C and b-1, then the total probability can be summed over x and y,
-C
-C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
-C --- * sum sum --- = --- * ------- * ------- = 1/4
-C b^2 x=0 y=1 b^2 b^4 2 2
-C
-C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
-C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
-
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 9)
-',`
-deflit(UNROLL_THRESHOLD, 6)
-')
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-
-PROLOGUE(M4_function_1c)
- pushl %esi
-deflit(`FRAME',4)
- movl PARAM_CARRY, %esi
- jmp LF(M4_function_1,start_nc)
-EPILOGUE()
-
-PROLOGUE(M4_function_1)
- push %esi
-deflit(`FRAME',4)
- xorl %esi, %esi C initial carry
-
-L(start_nc):
- movl PARAM_SIZE, %ecx
- pushl %ebx
-deflit(`FRAME',8)
-
- movl PARAM_SRC, %ebx
- pushl %edi
-deflit(`FRAME',12)
-
- cmpl $UNROLL_THRESHOLD, %ecx
- movl PARAM_DST, %edi
-
- pushl %ebp
-deflit(`FRAME',16)
- jae L(unroll)
-
-
- C simple loop
-
- movl PARAM_MULTIPLIER, %ebp
-
-L(simple):
- C eax scratch
- C ebx src
- C ecx counter
- C edx scratch
- C esi carry
- C edi dst
- C ebp multiplier
-
- movl (%ebx), %eax
- addl $4, %ebx
-
- mull %ebp
-
- addl $4, %edi
- addl %esi, %eax
-
- adcl $0, %edx
-
- M4_inst %eax, -4(%edi)
-
- adcl $0, %edx
-
- movl %edx, %esi
- loop L(simple)
-
-
- popl %ebp
- popl %edi
-
- popl %ebx
- movl %esi, %eax
-
- popl %esi
- ret
-
-
-
-C -----------------------------------------------------------------------------
-C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
-C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
-C For the computed jump an odd size means they start one way around, an even
-C size the other.
-C
-C VAR_JUMP holds the computed jump temporarily because there's not enough
-C registers at the point of doing the mul for the initial two carry limbs.
-C
-C The add/adc for the initial carry in %esi is necessary only for the
-C mpn_addmul/submul_1c entry points. Duplicating the startup code to
-C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
-C idea.
-
-dnl overlapping with parameters already fetched
-define(VAR_COUNTER, `PARAM_SIZE')
-define(VAR_JUMP, `PARAM_DST')
-
-L(unroll):
- C eax
- C ebx src
- C ecx size
- C edx
- C esi initial carry
- C edi dst
- C ebp
-
- movl %ecx, %edx
- decl %ecx
-
- subl $2, %edx
- negl %ecx
-
- shrl $UNROLL_LOG2, %edx
- andl $UNROLL_MASK, %ecx
-
- movl %edx, VAR_COUNTER
- movl %ecx, %edx
-
- shll $4, %edx
- negl %ecx
-
- C 15 code bytes per limb
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%edx,%ecx,1), %edx
-')
- movl (%ebx), %eax C src low limb
-
- movl PARAM_MULTIPLIER, %ebp
- movl %edx, VAR_JUMP
-
- mull %ebp
-
- addl %esi, %eax C initial carry (from _1c)
- jadcl0( %edx)
-
-
- leal 4(%ebx,%ecx,4), %ebx
- movl %edx, %esi C high carry
-
- movl VAR_JUMP, %edx
- leal (%edi,%ecx,4), %edi
-
- testl $1, %ecx
- movl %eax, %ecx C low carry
-
- jz L(noswap)
- movl %esi, %ecx C high,low carry other way around
-
- movl %eax, %esi
-L(noswap):
-
- jmp *%edx
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%edx,%ecx,1), %edx
- addl $L(entry)-L(here), %edx
- addl (%esp), %edx
- ret
-')
-
-
-C -----------------------------------------------------------
- ALIGN(32)
-L(top):
-deflit(`FRAME',16)
- C eax scratch
- C ebx src
- C ecx carry lo
- C edx scratch
- C esi carry hi
- C edi dst
- C ebp multiplier
- C
- C 15 code bytes per limb
-
- leal UNROLL_BYTES(%edi), %edi
-
-L(entry):
-forloop(`i', 0, UNROLL_COUNT/2-1, `
- deflit(`disp0', eval(2*i*4))
- deflit(`disp1', eval(disp0 + 4))
-
-Zdisp( movl, disp0,(%ebx), %eax)
- mull %ebp
-Zdisp( M4_inst,%ecx, disp0,(%edi))
- adcl %eax, %esi
- movl %edx, %ecx
- jadcl0( %ecx)
-
- movl disp1(%ebx), %eax
- mull %ebp
- M4_inst %esi, disp1(%edi)
- adcl %eax, %ecx
- movl %edx, %esi
- jadcl0( %esi)
-')
-
- decl VAR_COUNTER
- leal UNROLL_BYTES(%ebx), %ebx
-
- jns L(top)
-
-
- popl %ebp
- M4_inst %ecx, UNROLL_BYTES(%edi)
-
- popl %edi
- movl %esi, %eax
-
- popl %ebx
- jadcl0( %eax)
-
- popl %esi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl
deleted file mode 100644
index 21734f3e52..0000000000
--- a/rts/gmp/mpn/x86/k6/cross.pl
+++ /dev/null
@@ -1,141 +0,0 @@
-#! /usr/bin/perl
-
-# Copyright (C) 2000 Free Software Foundation, Inc.
-#
-# This file is part of the GNU MP Library.
-#
-# The GNU MP Library is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published
-# by the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# The GNU MP Library is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-# License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-# MA 02111-1307, USA.
-
-
-# Usage: cross.pl [filename.o]...
-#
-# Produce an annotated disassembly of the given object files, indicating
-# certain code alignment and addressing mode problems afflicting K6 chips.
-# "ZZ" is used on all annotations, so this can be searched for.
-#
-# With no arguments, all .o files corresponding to .asm files are processed.
-# This is good in the mpn object directory of a k6*-*-* build.
-#
-# As far as fixing problems goes, any cache line crossing problems in loops
-# get attention, but as a rule it's too tedious to rearrange code or slip in
-# nops to fix every problem in setup or finishup code.
-#
-# Bugs:
-#
-# Instructions without mod/rm bytes or which are already vector decoded are
-# unaffected by cache line boundary crossing, but not all of these have yet
-# been put in as exceptions. All that occur in practice in GMP are present
-# though.
-#
-# There's no messages for using the vector decoded addressing mode (%esi),
-# but that mode is easy to avoid when coding.
-
-use strict;
-
-sub disassemble {
- my ($file) = @_;
- my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
-
- open (IN, "objdump -Srfh $file |")
- || die "Cannot open pipe from objdump\n";
- while (<IN>) {
- print;
-
- if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
- if ($1 < 5) {
- print "ZZ need at least 2**5 for predictable cache line crossing\n";
- }
- }
-
- if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
- ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
-
- } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
- ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
-
- } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
- ($addr,$b1,$b2,$b3) = ($1,$2,'','');
-
- } else {
- next;
- }
-
- if ($b1 =~ /0f/) {
- $prefix = $b1;
- $opcode = $b2;
- $modrm = $b3;
- } else {
- $prefix = '';
- $opcode = $b1;
- $modrm = $b2;
- }
-
- # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
- # for K6 and pre-CXT K6-2
- if ($prefix =~ /0f/
- && $opcode !~ /^8/ # jcond disp32
- && $modrm =~ /^[0-3][4c]/) {
- print "ZZ ($file) >3 bytes to determine instruction length\n";
- }
-
- # with just an opcode, starting 1f mod 20h
- if ($addr =~ /[13579bdf]f$/
- && $prefix !~ /0f/
- && $opcode !~ /1[012345]/ # adc
- && $opcode !~ /1[89abcd]/ # sbb
- && $opcode !~ /68/ # push $imm32
- && $opcode !~ /^7/ # jcond disp8
- && $opcode !~ /a[89]/ # test+imm
- && $opcode !~ /a[a-f]/ # stos/lods/scas
- && $opcode !~ /b8/ # movl $imm32,%eax
- && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
- && $opcode !~ /e[b9]/ # jmp disp8/disp32
- && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
- && !($opcode =~ /f[67]/ # grp 1
- && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
- && $modrm !~ /^$/) {
- print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
- }
-
- # with an 0F prefix, anything starting at 1f mod 20h
- if ($addr =~ /[13579bdf][f]$/
- && $prefix =~ /0f/) {
- print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
- }
-
- # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
- if ($addr =~ /[13579bdf][e]$/
- && $prefix =~ /0f/
- && $opcode !~ /^8/ # jcond disp32
- && $modrm !~ /^$/) {
- print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
- }
- }
- close IN || die "Error from objdump (or objdump not available)\n";
-}
-
-
-my @files;
-if ($#ARGV >= 0) {
- @files = @ARGV;
-} else {
- @files = glob "*.asm";
- map {s/.asm/.o/} @files;
-}
-
-foreach (@files) {
- disassemble($_);
-}
diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm
deleted file mode 100644
index ffb97bc380..0000000000
--- a/rts/gmp/mpn/x86/k6/diveby3.asm
+++ /dev/null
@@ -1,110 +0,0 @@
-dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
-dnl
-dnl K6: 11.0 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t carry);
-C
-C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
-C lead to vector decoding, unlike plain (%esi) does.
-
-defframe(PARAM_CARRY,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl multiplicative inverse of 3, modulo 2^32
-deflit(INVERSE_3, 0xAAAAAAAB)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_divexact_by3c)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- pushl %esi defframe_pushl(SAVE_ESI)
-
- movl PARAM_SRC, %esi
- pushl %edi defframe_pushl(SAVE_EDI)
-
- movl PARAM_DST, %edi
- pushl %ebx defframe_pushl(SAVE_EBX)
-
- movl PARAM_CARRY, %ebx
- leal (%esi,%ecx,4), %esi
-
- pushl $3 defframe_pushl(VAR_THREE)
- leal (%edi,%ecx,4), %edi
-
- negl %ecx
-
-
- C Need 32 alignment for claimed speed, to avoid the movl store
- C opcode/modrm crossing a cache line boundary
-
- ALIGN(32)
-L(top):
- C eax scratch, low product
- C ebx carry limb (0 to 3)
- C ecx counter, limbs, negative
- C edx scratch, high product
- C esi &src[size]
- C edi &dst[size]
- C ebp
- C
- C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
- C doesn't cross a 32 byte boundary, saving a couple of cycles
- C (that's a fixed couple, not per loop).
-
-Zdisp( movl, 0,(%esi,%ecx,4), %eax)
- subl %ebx, %eax
-
- setc %bl
-
- imull $INVERSE_3, %eax
-
- movl %eax, (%edi,%ecx,4)
- addl $2, %ecx
-
- mull VAR_THREE
-
- addl %edx, %ebx
- loop L(top)
-
-
- movl SAVE_ESI, %esi
- movl %ebx, %eax
-
- movl SAVE_EBX, %ebx
-
- movl SAVE_EDI, %edi
- addl $FRAME, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h
deleted file mode 100644
index 77f3948d77..0000000000
--- a/rts/gmp/mpn/x86/k6/gmp-mparam.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-#define BITS_PER_MP_LIMB 32
-#define BYTES_PER_MP_LIMB 4
-#define BITS_PER_LONGINT 32
-#define BITS_PER_INT 32
-#define BITS_PER_SHORTINT 16
-#define BITS_PER_CHAR 8
-
-
-#ifndef UMUL_TIME
-#define UMUL_TIME 3 /* cycles */
-#endif
-
-#ifndef UDIV_TIME
-#define UDIV_TIME 20 /* cycles */
-#endif
-
-/* bsfl takes 12-27 cycles, put an average for uniform random numbers */
-#ifndef COUNT_TRAILING_ZEROS_TIME
-#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */
-#endif
-
-
-/* Generated by tuneup.c, 2000-07-04. */
-
-#ifndef KARATSUBA_MUL_THRESHOLD
-#define KARATSUBA_MUL_THRESHOLD 18
-#endif
-#ifndef TOOM3_MUL_THRESHOLD
-#define TOOM3_MUL_THRESHOLD 130
-#endif
-
-#ifndef KARATSUBA_SQR_THRESHOLD
-#define KARATSUBA_SQR_THRESHOLD 34
-#endif
-#ifndef TOOM3_SQR_THRESHOLD
-#define TOOM3_SQR_THRESHOLD 116
-#endif
-
-#ifndef BZ_THRESHOLD
-#define BZ_THRESHOLD 68
-#endif
-
-#ifndef FIB_THRESHOLD
-#define FIB_THRESHOLD 98
-#endif
-
-#ifndef POWM_THRESHOLD
-#define POWM_THRESHOLD 13
-#endif
-
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 4
-#endif
-#ifndef GCDEXT_THRESHOLD
-#define GCDEXT_THRESHOLD 67
-#endif
-
-#ifndef FFT_MUL_TABLE
-#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_MUL_THRESHOLD
-#define FFT_MODF_MUL_THRESHOLD 472
-#endif
-#ifndef FFT_MUL_THRESHOLD
-#define FFT_MUL_THRESHOLD 4352
-#endif
-
-#ifndef FFT_SQR_TABLE
-#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_SQR_THRESHOLD
-#define FFT_MODF_SQR_THRESHOLD 544
-#endif
-#ifndef FFT_SQR_THRESHOLD
-#define FFT_SQR_THRESHOLD 4352
-#endif
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
deleted file mode 100644
index 20a33e6ccf..0000000000
--- a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
+++ /dev/null
@@ -1,179 +0,0 @@
-dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
-dnl
-dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
-dnl alignment.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K6-2 aligned:
-dnl UNROLL_COUNT cycles/limb
-dnl 8 0.75
-dnl 16 0.625
-dnl 32 0.5625
-dnl 64 0.53
-dnl Maximum possible with the current code is 64, the minimum is 2.
-
-deflit(UNROLL_COUNT, 32)
-
-
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Copy src,size to dst,size, processing limbs from high to low addresses.
-C
-C The comments in copyi.asm apply here too.
-
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_copyd)
- movl PARAM_SIZE, %ecx
- movl %esi, %eax
-
- movl PARAM_SRC, %esi
- movl %edi, %edx
-
- std
-
- movl PARAM_DST, %edi
- cmpl $UNROLL_COUNT, %ecx
-
- leal -4(%esi,%ecx,4), %esi
-
- leal -4(%edi,%ecx,4), %edi
- ja L(unroll)
-
-L(simple):
- rep
- movsl
-
- cld
-
- movl %eax, %esi
- movl %edx, %edi
-
- ret
-
-
-L(unroll):
- C if src and dst are different alignments mod8, then use rep movs
- C if src and dst are both 4mod8 then process one limb to get 0mod8
-
- pushl %ebx
- leal (%esi,%edi), %ebx
-
- testb $4, %bl
- popl %ebx
-
- jnz L(simple)
- testl $4, %esi
-
- leal -UNROLL_COUNT(%ecx), %ecx
- jnz L(already_aligned)
-
- movsl
-
- decl %ecx
-L(already_aligned):
-
-
-ifelse(UNROLL_BYTES,256,`
- subl $128, %esi
- subl $128, %edi
-')
-
- C offset 0x3D here, but gets full speed without further alignment
-L(top):
- C eax saved esi
- C ebx
- C ecx counter, limbs
- C edx saved edi
- C esi src, incrementing
- C edi dst, incrementing
- C ebp
- C
- C `disp' is never 0, so don't need to force 0(%esi).
-
-deflit(CHUNK_COUNT, 2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
- movq disp(%esi), %mm0
- movq %mm0, disp(%edi)
-')
-
- leal -UNROLL_BYTES(%esi), %esi
- subl $UNROLL_COUNT, %ecx
-
- leal -UNROLL_BYTES(%edi), %edi
- jns L(top)
-
-
- C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to
- C UNROLL_COUNT-1 limbs remaining
-
- testb $eval(UNROLL_COUNT/2), %cl
-
- leal UNROLL_COUNT(%ecx), %ecx
- jz L(not_half)
-
-
- C at an unroll count of 32 this block of code is 16 cycles faster than
- C the rep movs, less 3 or 4 to test whether to do it
-
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
- deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
- movq disp(%esi), %mm0
- movq %mm0, disp(%edi)
-')
-
- subl $eval(UNROLL_BYTES/2), %esi
- subl $eval(UNROLL_BYTES/2), %edi
-
- subl $eval(UNROLL_COUNT/2), %ecx
-L(not_half):
-
-
-ifelse(UNROLL_BYTES,256,`
- addl $128, %esi
- addl $128, %edi
-')
-
- rep
- movsl
-
- cld
-
- movl %eax, %esi
- movl %edx, %edi
-
- femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
deleted file mode 100644
index 215d805f2e..0000000000
--- a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
-dnl
-dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
-dnl alignment.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K6-2 aligned:
-dnl UNROLL_COUNT cycles/limb
-dnl 8 0.75
-dnl 16 0.625
-dnl 32 0.5625
-dnl 64 0.53
-dnl Maximum possible with the current code is 64, the minimum is 2.
-
-deflit(UNROLL_COUNT, 32)
-
-
-C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
-C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
-C used instead.
-C
-C mod8
-C src dst
-C 0 0 both aligned, use mmx
-C 0 4 unaligned, use rep movs
-C 4 0 unaligned, use rep movs
-C 4 4 do one movs, then both aligned, use mmx
-C
-C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
-C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
-C
-C A pattern of two movq loads and two movq stores (or four and four) was
-C tried, but found to be the same speed as just one of each.
-C
-C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx
-C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
-C movs.
-C
-C Enhancement:
-C
-C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd
-C make it possible to avoid incrementing %esi and %edi in the loop and hence
-C get loop overhead down to 1 cycle. Care would be needed to avoid bad
-C cache line crossings since the "movq"s would then be 5 code bytes rather
-C than 4.
-
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_copyi)
- movl PARAM_SIZE, %ecx
- movl %esi, %eax
-
- movl PARAM_SRC, %esi
- movl %edi, %edx
-
- cld
-
- movl PARAM_DST, %edi
- cmpl $UNROLL_COUNT, %ecx
-
- ja L(unroll)
-
-L(simple):
- rep
- movsl
-
- movl %eax, %esi
- movl %edx, %edi
-
- ret
-
-
-L(unroll):
- C if src and dst are different alignments mod8, then use rep movs
- C if src and dst are both 4mod8 then process one limb to get 0mod8
-
- pushl %ebx
- leal (%esi,%edi), %ebx
-
- testb $4, %bl
- popl %ebx
-
- jnz L(simple)
- testl $4, %esi
-
- leal -UNROLL_COUNT(%ecx), %ecx
- jz L(already_aligned)
-
- decl %ecx
-
- movsl
-L(already_aligned):
-
-
-ifelse(UNROLL_BYTES,256,`
- addl $128, %esi
- addl $128, %edi
-')
-
- C this is offset 0x34, no alignment needed
-L(top):
- C eax saved esi
- C ebx
- C ecx counter, limbs
- C edx saved edi
- C esi src, incrementing
- C edi dst, incrementing
- C ebp
- C
- C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
- C 0(%edi) keeps code aligned to 16 byte boundaries.
-
-deflit(CHUNK_COUNT, 2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
-Zdisp( movq, disp,(%esi), %mm0)
-Zdisp( movq, %mm0, disp,(%edi))
-')
-
- addl $UNROLL_BYTES, %esi
- subl $UNROLL_COUNT, %ecx
-
- leal UNROLL_BYTES(%edi), %edi
- jns L(top)
-
-
- C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to
- C UNROLL_COUNT-1 limbs remaining
-
- testb $eval(UNROLL_COUNT/2), %cl
-
- leal UNROLL_COUNT(%ecx), %ecx
- jz L(not_half)
-
- C at an unroll count of 32 this block of code is 16 cycles faster than
- C the rep movs, less 3 or 4 to test whether to do it
-
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
- deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- movq disp(%esi), %mm0
- movq %mm0, disp(%edi)
-')
- addl $eval(UNROLL_BYTES/2), %esi
- addl $eval(UNROLL_BYTES/2), %edi
-
- subl $eval(UNROLL_COUNT/2), %ecx
-L(not_half):
-
-
-ifelse(UNROLL_BYTES,256,`
- subl $128, %esi
- subl $128, %edi
-')
-
- rep
- movsl
-
- movl %eax, %esi
- movl %edx, %edi
-
- femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
deleted file mode 100644
index f6d54f97a8..0000000000
--- a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-dnl AMD K6-2 mpn_lshift -- mpn left shift.
-dnl
-dnl K6-2: 1.75 cycles/limb
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-dnl used after src has been fetched
-define(VAR_RETVAL,`PARAM_SRC')
-
-dnl minimum 9, because unrolled loop can't handle less
-deflit(UNROLL_THRESHOLD, 9)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_lshift)
-deflit(`FRAME',0)
-
- C The 1 limb case can be done without the push %ebx, but it's then
- C still the same speed. The push is left as a free helping hand for
- C the two_or_more code.
-
- movl PARAM_SIZE, %eax
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- decl %eax
-
- movl PARAM_SHIFT, %ecx
- jnz L(two_or_more)
-
- movl (%ebx), %edx C src limb
- movl PARAM_DST, %ebx
-
- shldl( %cl, %edx, %eax) C return value
-
- shll %cl, %edx
-
- movl %edx, (%ebx) C dst limb
- popl %ebx
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16) C avoid offset 0x1f
-L(two_or_more):
- C eax size-1
- C ebx src
- C ecx shift
- C edx
-
- movl (%ebx,%eax,4), %edx C src high limb
- negl %ecx
-
- movd PARAM_SHIFT, %mm6
- addl $32, %ecx C 32-shift
-
- shrl %cl, %edx
- cmpl $UNROLL_THRESHOLD-1, %eax
-
- movl %edx, VAR_RETVAL
- jae L(unroll)
-
-
- movd %ecx, %mm7
- movl %eax, %ecx
-
- movl PARAM_DST, %eax
-
-L(simple):
- C eax dst
- C ebx src
- C ecx counter, size-1 to 1
- C edx retval
- C
- C mm0 scratch
- C mm6 shift
- C mm7 32-shift
-
- movq -4(%ebx,%ecx,4), %mm0
-
- psrlq %mm7, %mm0
-
-Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
- loop L(simple)
-
-
- movd (%ebx), %mm0
- popl %ebx
-
- psllq %mm6, %mm0
-
- movd %mm0, (%eax)
- movl %edx, %eax
-
- femms
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax size-1
- C ebx src
- C ecx 32-shift
- C edx retval (but instead VAR_RETVAL is used)
- C
- C mm6 shift
-
- addl $32, %ecx
- movl PARAM_DST, %edx
-
- movd %ecx, %mm7
- subl $7, %eax C size-8
-
- leal (%edx,%eax,4), %ecx C alignment of dst
-
- movq 32-8(%ebx,%eax,4), %mm2 C src high qword
- testb $4, %cl
-
- jz L(dst_aligned)
- psllq %mm6, %mm2
-
- psrlq $32, %mm2
- decl %eax
-
- movd %mm2, 32(%edx,%eax,4) C dst high limb
- movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
-L(dst_aligned):
-
- movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
-
-
- C This loop is the important bit, the rest is just support for it.
- C Four src limbs are held at the start, and four more will be read.
- C Four dst limbs will be written. This schedule seems necessary for
- C full speed.
- C
- C The use of size-8 lets the loop stop when %eax goes negative and
- C leaves -4 to -1 which can be tested with test $1 and $2.
-
-L(top):
- C eax counter, size-8 step by -4 until <0
- C ebx src
- C ecx
- C edx dst
- C
- C mm0 src next qword
- C mm1 scratch
- C mm2 src prev qword
- C mm6 shift
- C mm7 64-shift
-
- psllq %mm6, %mm2
- subl $4, %eax
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- por %mm0, %mm2
- movq 24(%ebx,%eax,4), %mm0
-
- psllq %mm6, %mm1
- movq %mm2, 40(%edx,%eax,4)
-
- movq %mm0, %mm2
- psrlq %mm7, %mm0
-
- por %mm0, %mm1
- movq 16(%ebx,%eax,4), %mm0
-
- movq %mm1, 32(%edx,%eax,4)
- jnc L(top)
-
-
- C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
- C
- C 8(%ebx) is the next source, and 24(%edx) is the next destination.
- C %eax is between -4 and -1, representing respectively 0 to 3 extra
- C limbs that must be read.
-
-
- testl $2, %eax C testl to avoid bad cache line crossing
- jz L(finish_nottwo)
-
- C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
- C new mm2 and a new mm0 is loaded.
-
- psllq %mm6, %mm2
- movq %mm0, %mm1
-
- psrlq %mm7, %mm0
- subl $2, %eax
-
- por %mm0, %mm2
- movq 16(%ebx,%eax,4), %mm0
-
- movq %mm2, 32(%edx,%eax,4)
- movq %mm1, %mm2
-L(finish_nottwo):
-
-
- C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
-
- testb $1, %al
- psllq %mm6, %mm2
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- por %mm0, %mm2
- psllq %mm6, %mm1
-
- movq %mm2, 24(%edx,%eax,4)
- jz L(finish_even)
-
-
- C Size is odd, so mm1 and one extra limb to process.
-
- movd (%ebx), %mm0 C src[0]
- popl %ebx
-deflit(`FRAME',0)
-
- movq %mm0, %mm2
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
-
- psllq %mm6, %mm2
- por %mm0, %mm1
-
- movq %mm1, 4(%edx) C dst[1,2]
- movd %mm2, (%edx) C dst[0]
-
- movl VAR_RETVAL, %eax
-
- femms
- ret
-
-
- nop C avoid bad cache line crossing
-L(finish_even):
-deflit(`FRAME',4)
- C Size is even, so only mm1 left to process.
-
- movq %mm1, (%edx) C dst[0,1]
- movl VAR_RETVAL, %eax
-
- popl %ebx
- femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
deleted file mode 100644
index 8a8c144241..0000000000
--- a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
+++ /dev/null
@@ -1,285 +0,0 @@
-dnl AMD K6-2 mpn_rshift -- mpn right shift.
-dnl
-dnl K6-2: 1.75 cycles/limb
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-dnl Minimum 9, because the unrolled loop can't handle less.
-dnl
-deflit(UNROLL_THRESHOLD, 9)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_rshift)
-deflit(`FRAME',0)
-
- C The 1 limb case can be done without the push %ebx, but it's then
- C still the same speed. The push is left as a free helping hand for
- C the two_or_more code.
-
- movl PARAM_SIZE, %eax
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- decl %eax
-
- movl PARAM_SHIFT, %ecx
- jnz L(two_or_more)
-
- movl (%ebx), %edx C src limb
- movl PARAM_DST, %ebx
-
- shrdl( %cl, %edx, %eax) C return value
-
- shrl %cl, %edx
-
- movl %edx, (%ebx) C dst limb
- popl %ebx
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16) C avoid offset 0x1f
-L(two_or_more):
- C eax size-1
- C ebx src
- C ecx shift
- C edx
-
- movl (%ebx), %edx C src low limb
- negl %ecx
-
- addl $32, %ecx
- movd PARAM_SHIFT, %mm6
-
- shll %cl, %edx
- cmpl $UNROLL_THRESHOLD-1, %eax
-
- jae L(unroll)
-
-
- C eax size-1
- C ebx src
- C ecx 32-shift
- C edx retval
- C
- C mm6 shift
-
- movl PARAM_DST, %ecx
- leal (%ebx,%eax,4), %ebx
-
- leal -4(%ecx,%eax,4), %ecx
- negl %eax
-
- C This loop runs at about 3 cycles/limb, which is the amount of
- C decoding, and this is despite every second access being unaligned.
-
-L(simple):
- C eax counter, -(size-1) to -1
- C ebx &src[size-1]
- C ecx &dst[size-1]
- C edx retval
- C
- C mm0 scratch
- C mm6 shift
-
-Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
- incl %eax
-
- psrlq %mm6, %mm0
-
-Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
- jnz L(simple)
-
-
- movq %mm0, (%ecx)
- movl %edx, %eax
-
- popl %ebx
-
- femms
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax size-1
- C ebx src
- C ecx 32-shift
- C edx retval
- C
- C mm6 shift
-
- addl $32, %ecx
- subl $7, %eax C size-8
-
- movd %ecx, %mm7
- movl PARAM_DST, %ecx
-
- movq (%ebx), %mm2 C src low qword
- leal (%ebx,%eax,4), %ebx C src end - 32
-
- testb $4, %cl
- leal (%ecx,%eax,4), %ecx C dst end - 32
-
- notl %eax C -(size-7)
- jz L(dst_aligned)
-
- psrlq %mm6, %mm2
- incl %eax
-
-Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
- movq 4(%ebx,%eax,4), %mm2 C new src low qword
-L(dst_aligned):
-
- movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
- nop C avoid bad cache line crossing
-
-
- C This loop is the important bit, the rest is just support for it.
- C Four src limbs are held at the start, and four more will be read.
- C Four dst limbs will be written. This schedule seems necessary for
- C full speed.
- C
- C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
- C and leaves 0 to 3 which can be tested with test $1 and $2.
-
-L(top):
- C eax counter, -(size-7) step by +4 until >=0
- C ebx src end - 32
- C ecx dst end - 32
- C edx retval
- C
- C mm0 src next qword
- C mm1 scratch
- C mm2 src prev qword
- C mm6 shift
- C mm7 64-shift
-
- psrlq %mm6, %mm2
- addl $4, %eax
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- por %mm0, %mm2
- movq 4(%ebx,%eax,4), %mm0
-
- psrlq %mm6, %mm1
- movq %mm2, -12(%ecx,%eax,4)
-
- movq %mm0, %mm2
- psllq %mm7, %mm0
-
- por %mm0, %mm1
- movq 12(%ebx,%eax,4), %mm0
-
- movq %mm1, -4(%ecx,%eax,4)
- ja L(top) C jump if no carry and not zero
-
-
-
- C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
- C to 3 representing respectively 3 to 0 further limbs.
-
- testl $2, %eax C testl to avoid bad cache line crossings
- jnz L(finish_nottwo)
-
- C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
- C becomes new mm2 and a new mm0 is loaded.
-
- psrlq %mm6, %mm2
- movq %mm0, %mm1
-
- psllq %mm7, %mm0
- addl $2, %eax
-
- por %mm0, %mm2
- movq 12(%ebx,%eax,4), %mm0
-
- movq %mm2, -4(%ecx,%eax,4)
- movq %mm1, %mm2
-L(finish_nottwo):
-
-
- testb $1, %al
- psrlq %mm6, %mm2
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- por %mm0, %mm2
- psrlq %mm6, %mm1
-
- movq %mm2, 4(%ecx,%eax,4)
- jnz L(finish_even)
-
-
- C one further extra limb to process
-
- movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
- popl %ebx
-
- movq %mm0, %mm2
- psllq %mm7, %mm0
-
- por %mm0, %mm1
- psrlq %mm6, %mm2
-
- movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
- movd %mm2, 32-4(%ecx) C dst[size-1]
-
- movl %edx, %eax C retval
-
- femms
- ret
-
-
- nop C avoid bad cache line crossing
-L(finish_even):
- C no further extra limbs
-
- movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
- movl %edx, %eax C retval
-
- popl %ebx
-
- femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
deleted file mode 100644
index 8915080f0f..0000000000
--- a/rts/gmp/mpn/x86/k6/mmx/com_n.asm
+++ /dev/null
@@ -1,91 +0,0 @@
-dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
-dnl
-dnl alignment dst/src, A=0mod8 N=4mod8
-dnl A/A A/N N/A N/N
-dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb
-dnl K6 1.5 1.85 1.75 1.85
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Take the bitwise ones-complement of src,size and write it to dst,size.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_com_n)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl PARAM_SRC, %eax
- movl PARAM_DST, %edx
- shrl %ecx
- jnz L(two_or_more)
-
- movl (%eax), %eax
- notl %eax
- movl %eax, (%edx)
- ret
-
-
-L(two_or_more):
- pushl %ebx
-FRAME_pushl()
- movl %ecx, %ebx
-
- pcmpeqd %mm7, %mm7 C all ones
-
-
- ALIGN(16)
-L(top):
- C eax src
- C ebx floor(size/2)
- C ecx counter
- C edx dst
- C esi
- C edi
- C ebp
-
- movq -8(%eax,%ecx,8), %mm0
- pxor %mm7, %mm0
- movq %mm0, -8(%edx,%ecx,8)
- loop L(top)
-
-
- jnc L(no_extra)
- movl (%eax,%ebx,8), %eax
- notl %eax
- movl %eax, (%edx,%ebx,8)
-L(no_extra):
-
- popl %ebx
- emms_or_femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
deleted file mode 100644
index 46cb3b7ea5..0000000000
--- a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
-dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
-dnl
-dnl alignment dst/src1/src2, A=0mod8, N=4mod8
-dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
-dnl
-dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
-dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
-dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
-dnl
-dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
-dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
-dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl M4_p and M4_i are the MMX and integer instructions
-dnl M4_*_neg_dst means whether to negate the final result before writing
-dnl M4_*_neg_src2 means whether to negate the src2 values before using them
-
-define(M4_choose_op,
-m4_assert_numargs(7)
-`ifdef(`OPERATION_$1',`
-define(`M4_function', `mpn_$1')
-define(`M4_operation', `$1')
-define(`M4_p', `$2')
-define(`M4_p_neg_dst', `$3')
-define(`M4_p_neg_src2',`$4')
-define(`M4_i', `$5')
-define(`M4_i_neg_dst', `$6')
-define(`M4_i_neg_src2',`$7')
-')')
-
-dnl xnor is done in "iorn" style because it's a touch faster than "nior"
-dnl style (the two are equivalent for xor).
-
-M4_choose_op( and_n, pand,0,0, andl,0,0)
-M4_choose_op( andn_n, pandn,0,0, andl,0,1)
-M4_choose_op( nand_n, pand,1,0, andl,1,0)
-M4_choose_op( ior_n, por,0,0, orl,0,0)
-M4_choose_op( iorn_n, por,0,1, orl,0,1)
-M4_choose_op( nior_n, por,1,0, orl,1,0)
-M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
-M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
-
-ifdef(`M4_function',,
-`m4_error(`Unrecognised or undefined OPERATION symbol
-')')
-
-MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-
-
-C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C
-C Do src1,size M4_operation src2,size, storing the result in dst,size.
-C
-C Unaligned movq loads and stores are a bit slower than aligned ones. The
-C test at the start of the routine checks the alignment of src1 and if
-C necessary processes one limb separately at the low end to make it aligned.
-C
-C The raw speeds without this alignment switch are as follows.
-C
-C alignment dst/src1/src2, A=0mod8, N=4mod8
-C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
-C
-C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
-C K6 1.75 2.2 2.0 2.28 iorn,xnor
-C K6 2.0 2.25 2.35 2.28 nand,nior
-C
-C
-C Future:
-C
-C K6 can do one 64-bit load per cycle so each of these routines should be
-C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
-C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
-C The others are 4 instructions per 2 limbs, and so can only approach 1.0
-C because there's nowhere to hide some loop control.
-
-defframe(PARAM_SIZE,16)
-defframe(PARAM_SRC2,12)
-defframe(PARAM_SRC1,8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-PROLOGUE(M4_function)
- movl PARAM_SIZE, %ecx
- pushl %ebx
- FRAME_pushl()
- movl PARAM_SRC1, %eax
- movl PARAM_SRC2, %ebx
- cmpl $1, %ecx
- movl PARAM_DST, %edx
- ja L(two_or_more)
-
-
- movl (%ebx), %ecx
- popl %ebx
-ifelse(M4_i_neg_src2,1,`notl %ecx')
- M4_i (%eax), %ecx
-ifelse(M4_i_neg_dst,1,` notl %ecx')
- movl %ecx, (%edx)
-
- ret
-
-
-L(two_or_more):
- C eax src1
- C ebx src2
- C ecx size
- C edx dst
- C esi
- C edi
- C ebp
- C
- C carry bit is low of size
-
- pushl %esi
- FRAME_pushl()
- testl $4, %eax
- jz L(alignment_ok)
-
- movl (%ebx), %esi
- addl $4, %ebx
-ifelse(M4_i_neg_src2,1,`notl %esi')
- M4_i (%eax), %esi
- addl $4, %eax
-ifelse(M4_i_neg_dst,1,` notl %esi')
- movl %esi, (%edx)
- addl $4, %edx
- decl %ecx
-
-L(alignment_ok):
- movl %ecx, %esi
- shrl %ecx
- jnz L(still_two_or_more)
-
- movl (%ebx), %ecx
- popl %esi
-ifelse(M4_i_neg_src2,1,`notl %ecx')
- M4_i (%eax), %ecx
-ifelse(M4_i_neg_dst,1,` notl %ecx')
- popl %ebx
- movl %ecx, (%edx)
- ret
-
-
-L(still_two_or_more):
-ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
- pcmpeqd %mm7, %mm7 C all ones
-')
-
- ALIGN(16)
-L(top):
- C eax src1
- C ebx src2
- C ecx counter
- C edx dst
- C esi
- C edi
- C ebp
- C
- C carry bit is low of size
-
- movq -8(%ebx,%ecx,8), %mm0
-ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
- M4_p -8(%eax,%ecx,8), %mm0
-ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
- movq %mm0, -8(%edx,%ecx,8)
-
- loop L(top)
-
-
- jnc L(no_extra)
-
- movl -4(%ebx,%esi,4), %ebx
-ifelse(M4_i_neg_src2,1,`notl %ebx')
- M4_i -4(%eax,%esi,4), %ebx
-ifelse(M4_i_neg_dst,1,` notl %ebx')
- movl %ebx, -4(%edx,%esi,4)
-L(no_extra):
-
- popl %esi
- popl %ebx
- emms_or_femms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
deleted file mode 100644
index f1dc83db46..0000000000
--- a/rts/gmp/mpn/x86/k6/mmx/lshift.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-dnl AMD K6 mpn_lshift -- mpn left shift.
-dnl
-dnl K6: 3.0 cycles/limb
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
-C instructions. This is despite every second fetch being unaligned.
-
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_lshift)
-deflit(`FRAME',0)
-
- C The 1 limb case can be done without the push %ebx, but it's then
- C still the same speed. The push is left as a free helping hand for
- C the two_or_more code.
-
- movl PARAM_SIZE, %eax
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- decl %eax
-
- movl PARAM_SHIFT, %ecx
- jnz L(two_or_more)
-
- movl (%ebx), %edx C src limb
- movl PARAM_DST, %ebx
-
- shldl( %cl, %edx, %eax) C return value
-
- shll %cl, %edx
-
- movl %edx, (%ebx) C dst limb
- popl %ebx
-
- ret
-
-
- ALIGN(16) C avoid offset 0x1f
- nop C avoid bad cache line crossing
-L(two_or_more):
- C eax size-1
- C ebx src
- C ecx shift
- C edx
-
- movl (%ebx,%eax,4), %edx C src high limb
- negl %ecx
-
- movd PARAM_SHIFT, %mm6
- addl $32, %ecx C 32-shift
-
- shrl %cl, %edx
-
- movd %ecx, %mm7
- movl PARAM_DST, %ecx
-
-L(top):
- C eax counter, size-1 to 1
- C ebx src
- C ecx dst
- C edx retval
- C
- C mm0 scratch
- C mm6 shift
- C mm7 32-shift
-
- movq -4(%ebx,%eax,4), %mm0
- decl %eax
-
- psrlq %mm7, %mm0
-
- movd %mm0, 4(%ecx,%eax,4)
- jnz L(top)
-
-
- movd (%ebx), %mm0
- popl %ebx
-
- psllq %mm6, %mm0
- movl %edx, %eax
-
- movd %mm0, (%ecx)
-
- emms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm
deleted file mode 100644
index 2c619252bb..0000000000
--- a/rts/gmp/mpn/x86/k6/mmx/popham.asm
+++ /dev/null
@@ -1,238 +0,0 @@
-dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
-dnl hamming distance.
-dnl
-dnl popcount hamdist
-dnl K6-2: 9.0 11.5 cycles/limb
-dnl K6: 12.5 13.0
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
-C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
-C
-C The code here isn't optimal, but it's already a 2x speedup over the plain
-C integer mpn/generic/popcount.c,hamdist.c.
-
-
-ifdef(`OPERATION_popcount',,
-`ifdef(`OPERATION_hamdist',,
-`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
-')m4exit(1)')')
-
-define(HAM,
-m4_assert_numargs(1)
-`ifdef(`OPERATION_hamdist',`$1')')
-
-define(POP,
-m4_assert_numargs(1)
-`ifdef(`OPERATION_popcount',`$1')')
-
-HAM(`
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC2, 8)
-defframe(PARAM_SRC, 4)
-define(M4_function,mpn_hamdist)
-')
-POP(`
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-define(M4_function,mpn_popcount)
-')
-
-MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-
-
-ifdef(`PIC',,`
- dnl non-PIC
-
- DATA
- ALIGN(8)
-
-define(LS,
-m4_assert_numargs(1)
-`LF(M4_function,`$1')')
-
-LS(rodata_AAAAAAAAAAAAAAAA):
- .long 0xAAAAAAAA
- .long 0xAAAAAAAA
-
-LS(rodata_3333333333333333):
- .long 0x33333333
- .long 0x33333333
-
-LS(rodata_0F0F0F0F0F0F0F0F):
- .long 0x0F0F0F0F
- .long 0x0F0F0F0F
-
-LS(rodata_000000FF000000FF):
- .long 0x000000FF
- .long 0x000000FF
-')
-
- .text
- ALIGN(32)
-
-POP(`ifdef(`PIC', `
- C avoid shrl crossing a 32-byte boundary
- nop')')
-
-PROLOGUE(M4_function)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- orl %ecx, %ecx
- jz L(zero)
-
-ifdef(`PIC',`
- movl $0xAAAAAAAA, %eax
- movl $0x33333333, %edx
-
- movd %eax, %mm7
- movd %edx, %mm6
-
- movl $0x0F0F0F0F, %eax
- movl $0x000000FF, %edx
-
- punpckldq %mm7, %mm7
- punpckldq %mm6, %mm6
-
- movd %eax, %mm5
- movd %edx, %mm4
-
- punpckldq %mm5, %mm5
- punpckldq %mm4, %mm4
-',`
-
- movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
- movq LS(rodata_3333333333333333), %mm6
- movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
- movq LS(rodata_000000FF000000FF), %mm4
-')
-
-define(REG_AAAAAAAAAAAAAAAA, %mm7)
-define(REG_3333333333333333, %mm6)
-define(REG_0F0F0F0F0F0F0F0F, %mm5)
-define(REG_000000FF000000FF, %mm4)
-
-
- movl PARAM_SRC, %eax
-HAM(` movl PARAM_SRC2, %edx')
-
- pxor %mm2, %mm2 C total
-
- shrl %ecx
- jnc L(top)
-
-Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
-
-HAM(`
-Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
- pxor %mm0, %mm1
-')
-
- incl %ecx
- jmp L(loaded)
-
-
- ALIGN(16)
-POP(` nop C alignment to avoid crossing 32-byte boundaries')
-
-L(top):
- C eax src
- C ebx
- C ecx counter, qwords, decrementing
- C edx [hamdist] src2
- C
- C mm0 (scratch)
- C mm1 (scratch)
- C mm2 total (low dword)
- C mm3
- C mm4 \
- C mm5 | special constants
- C mm6 |
- C mm7 /
-
- movq -8(%eax,%ecx,8), %mm1
-HAM(` pxor -8(%edx,%ecx,8), %mm1')
-
-L(loaded):
- movq %mm1, %mm0
- pand REG_AAAAAAAAAAAAAAAA, %mm1
-
- psrlq $1, %mm1
-HAM(` nop C code alignment')
-
- psubd %mm1, %mm0 C bit pairs
-HAM(` nop C code alignment')
-
-
- movq %mm0, %mm1
- psrlq $2, %mm0
-
- pand REG_3333333333333333, %mm0
- pand REG_3333333333333333, %mm1
-
- paddd %mm1, %mm0 C nibbles
-
-
- movq %mm0, %mm1
- psrlq $4, %mm0
-
- pand REG_0F0F0F0F0F0F0F0F, %mm0
- pand REG_0F0F0F0F0F0F0F0F, %mm1
-
- paddd %mm1, %mm0 C bytes
-
- movq %mm0, %mm1
- psrlq $8, %mm0
-
-
- paddb %mm1, %mm0 C words
-
-
- movq %mm0, %mm1
- psrlq $16, %mm0
-
- paddd %mm1, %mm0 C dwords
-
- pand REG_000000FF000000FF, %mm0
-
- paddd %mm0, %mm2 C low to total
- psrlq $32, %mm0
-
- paddd %mm0, %mm2 C high to total
- loop L(top)
-
-
-
- movd %mm2, %eax
- emms_or_femms
- ret
-
-L(zero):
- movl $0, %eax
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
deleted file mode 100644
index cc5948f26c..0000000000
--- a/rts/gmp/mpn/x86/k6/mmx/rshift.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-dnl AMD K6 mpn_rshift -- mpn right shift.
-dnl
-dnl K6: 3.0 cycles/limb
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
-C instructions. This is despite every second fetch being unaligned.
-
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_rshift)
-deflit(`FRAME',0)
-
- C The 1 limb case can be done without the push %ebx, but it's then
- C still the same speed. The push is left as a free helping hand for
- C the two_or_more code.
-
- movl PARAM_SIZE, %eax
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- decl %eax
-
- movl PARAM_SHIFT, %ecx
- jnz L(two_or_more)
-
- movl (%ebx), %edx C src limb
- movl PARAM_DST, %ebx
-
- shrdl( %cl, %edx, %eax) C return value
-
- shrl %cl, %edx
-
- movl %edx, (%ebx) C dst limb
- popl %ebx
-
- ret
-
-
- ALIGN(16) C avoid offset 0x1f
-L(two_or_more):
- C eax size-1
- C ebx src
- C ecx shift
- C edx
-
- movl (%ebx), %edx C src low limb
- negl %ecx
-
- addl $32, %ecx C 32-shift
- movd PARAM_SHIFT, %mm6
-
- shll %cl, %edx C retval
- movl PARAM_DST, %ecx
-
- leal (%ebx,%eax,4), %ebx
-
- leal -4(%ecx,%eax,4), %ecx
- negl %eax
-
-
-L(simple):
- C eax counter (negative)
- C ebx &src[size-1]
- C ecx &dst[size-1]
- C edx retval
- C
- C mm0 scratch
- C mm6 shift
-
-Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
- incl %eax
-
- psrlq %mm6, %mm0
-
-Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
- jnz L(simple)
-
-
- movq %mm0, (%ecx)
- movl %edx, %eax
-
- popl %ebx
-
- emms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm
deleted file mode 100644
index c2220fe4ca..0000000000
--- a/rts/gmp/mpn/x86/k6/mul_1.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
-dnl
-dnl K6: 6.25 cycles/limb.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier);
-C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier, mp_limb_t carry);
-C
-C Multiply src,size by mult and store the result in dst,size.
-C Return the carry limb from the top of the result.
-C
-C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
-C the low limb of the result.
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl minimum 5 because the unrolled code can't handle less
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_mul_1c)
- pushl %esi
-deflit(`FRAME',4)
- movl PARAM_CARRY, %esi
- jmp LF(mpn_mul_1,start_nc)
-EPILOGUE()
-
-
-PROLOGUE(mpn_mul_1)
- push %esi
-deflit(`FRAME',4)
- xorl %esi, %esi C initial carry
-
-L(start_nc):
- mov PARAM_SIZE, %ecx
- push %ebx
-FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- push %edi
-FRAME_pushl()
-
- movl PARAM_DST, %edi
- pushl %ebp
-FRAME_pushl()
-
- cmpl $UNROLL_THRESHOLD, %ecx
- movl PARAM_MULTIPLIER, %ebp
-
- jae L(unroll)
-
-
- C code offset 0x22 here, close enough to aligned
-L(simple):
- C eax scratch
- C ebx src
- C ecx counter
- C edx scratch
- C esi carry
- C edi dst
- C ebp multiplier
- C
- C this loop 8 cycles/limb
-
- movl (%ebx), %eax
- addl $4, %ebx
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, (%edi)
- addl $4, %edi
-
- loop L(simple)
-
-
- popl %ebp
-
- popl %edi
- popl %ebx
-
- movl %esi, %eax
- popl %esi
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C The code for each limb is 6 cycles, with instruction decoding being the
-C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
-C cycles/limb in total.
-C
-C The secret ingredient to get 6.25 is to start the loop with the mul and
-C have the load/store pair at the end. Rotating the load/store to the top
-C is an 0.5 c/l slowdown. (Some address generation effect probably.)
-C
-C The whole unrolled loop fits nicely in exactly 80 bytes.
-
-
- ALIGN(16) C already aligned to 16 here actually
-L(unroll):
- movl (%ebx), %eax
- leal -16(%ebx,%ecx,4), %ebx
-
- leal -16(%edi,%ecx,4), %edi
- subl $4, %ecx
-
- negl %ecx
-
-
- ALIGN(16) C one byte nop for this alignment
-L(top):
- C eax scratch
- C ebx &src[size-4]
- C ecx counter
- C edx scratch
- C esi carry
- C edi &dst[size-4]
- C ebp multiplier
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, (%edi,%ecx,4)
- movl 4(%ebx,%ecx,4), %eax
-
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, 4(%edi,%ecx,4)
- movl 8(%ebx,%ecx,4), %eax
-
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, 8(%edi,%ecx,4)
- movl 12(%ebx,%ecx,4), %eax
-
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, 12(%edi,%ecx,4)
- movl 16(%ebx,%ecx,4), %eax
-
-
- addl $4, %ecx
- js L(top)
-
-
-
- C eax next src limb
- C ebx &src[size-4]
- C ecx 0 to 3 representing respectively 4 to 1 further limbs
- C edx
- C esi carry
- C edi &dst[size-4]
-
- testb $2, %cl
- jnz L(finish_not_two)
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, (%edi,%ecx,4)
- movl 4(%ebx,%ecx,4), %eax
-
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, 4(%edi,%ecx,4)
- movl 8(%ebx,%ecx,4), %eax
-
- addl $2, %ecx
-L(finish_not_two):
-
-
- testb $1, %cl
- jnz L(finish_not_one)
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, 8(%edi)
- movl 12(%ebx), %eax
-L(finish_not_one):
-
-
- mull %ebp
-
- addl %esi, %eax
- popl %ebp
-
- adcl $0, %edx
-
- movl %eax, 12(%edi)
- popl %edi
-
- popl %ebx
- movl %edx, %eax
-
- popl %esi
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm
deleted file mode 100644
index 1f5a3a4b4b..0000000000
--- a/rts/gmp/mpn/x86/k6/mul_basecase.asm
+++ /dev/null
@@ -1,600 +0,0 @@
-dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
-dnl
-dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
-dnl unrolling).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K6: UNROLL_COUNT cycles/product (approx)
-dnl 8 9.75
-dnl 16 9.3
-dnl 32 9.3
-dnl Maximum possible with the current code is 32.
-dnl
-dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
-dnl might explain it's good performance.
-
-deflit(UNROLL_COUNT, 16)
-
-
-C void mpn_mul_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xsize,
-C mp_srcptr yp, mp_size_t ysize);
-C
-C Calculate xp,xsize multiplied by yp,ysize, storing the result in
-C wp,xsize+ysize.
-C
-C This routine is essentially the same as mpn/generic/mul_basecase.c, but
-C it's faster because it does most of the mpn_addmul_1() entry code only
-C once. The saving is about 10-20% on typical sizes coming from the
-C Karatsuba multiply code.
-C
-C Future:
-C
-C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
-C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
-C 256 bytes.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 8)
-',`
-deflit(UNROLL_THRESHOLD, 8)
-')
-
-defframe(PARAM_YSIZE,20)
-defframe(PARAM_YP, 16)
-defframe(PARAM_XSIZE,12)
-defframe(PARAM_XP, 8)
-defframe(PARAM_WP, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_mul_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_XSIZE, %ecx
- movl PARAM_YP, %eax
-
- movl PARAM_XP, %edx
- movl (%eax), %eax C yp low limb
-
- cmpl $2, %ecx
- ja L(xsize_more_than_two_limbs)
- je L(two_by_something)
-
-
- C one limb by one limb
-
- movl (%edx), %edx C xp low limb
- movl PARAM_WP, %ecx
-
- mull %edx
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
- ret
-
-
-C -----------------------------------------------------------------------------
-L(two_by_something):
- decl PARAM_YSIZE
- pushl %ebx
-deflit(`FRAME',4)
-
- movl PARAM_WP, %ebx
- pushl %esi
-deflit(`FRAME',8)
-
- movl %eax, %ecx C yp low limb
- movl (%edx), %eax C xp low limb
-
- movl %edx, %esi C xp
- jnz L(two_by_two)
-
-
- C two limbs by one limb
-
- mull %ecx
-
- movl %eax, (%ebx)
- movl 4(%esi), %eax
-
- movl %edx, %esi C carry
-
- mull %ecx
-
- addl %eax, %esi
- movl %esi, 4(%ebx)
-
- adcl $0, %edx
-
- movl %edx, 8(%ebx)
- popl %esi
-
- popl %ebx
- ret
-
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(two_by_two):
- C eax xp low limb
- C ebx wp
- C ecx yp low limb
- C edx
- C esi xp
- C edi
- C ebp
-deflit(`FRAME',8)
-
- mull %ecx C xp[0] * yp[0]
-
- push %edi
-deflit(`FRAME',12)
- movl %eax, (%ebx)
-
- movl 4(%esi), %eax
- movl %edx, %edi C carry, for wp[1]
-
- mull %ecx C xp[1] * yp[0]
-
- addl %eax, %edi
- movl PARAM_YP, %ecx
-
- adcl $0, %edx
-
- movl %edi, 4(%ebx)
- movl 4(%ecx), %ecx C yp[1]
-
- movl 4(%esi), %eax C xp[1]
- movl %edx, %edi C carry, for wp[2]
-
- mull %ecx C xp[1] * yp[1]
-
- addl %eax, %edi
-
- adcl $0, %edx
-
- movl (%esi), %eax C xp[0]
- movl %edx, %esi C carry, for wp[3]
-
- mull %ecx C xp[0] * yp[1]
-
- addl %eax, 4(%ebx)
- adcl %edx, %edi
- adcl $0, %esi
-
- movl %edi, 8(%ebx)
- popl %edi
-
- movl %esi, 12(%ebx)
- popl %esi
-
- popl %ebx
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(xsize_more_than_two_limbs):
-
-C The first limb of yp is processed with a simple mpn_mul_1 style loop
-C inline. Unrolling this doesn't seem worthwhile since it's only run once
-C (whereas the addmul below is run ysize-1 many times). A call to the
-C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
-C popping, and doesn't seem likely to be worthwhile on the typical 10-20
-C limb operations the Karatsuba code calls here with.
-
- C eax yp[0]
- C ebx
- C ecx xsize
- C edx xp
- C esi
- C edi
- C ebp
-deflit(`FRAME',0)
-
- pushl %edi defframe_pushl(SAVE_EDI)
- pushl %ebp defframe_pushl(SAVE_EBP)
-
- movl PARAM_WP, %edi
- pushl %esi defframe_pushl(SAVE_ESI)
-
- movl %eax, %ebp
- pushl %ebx defframe_pushl(SAVE_EBX)
-
- leal (%edx,%ecx,4), %ebx C xp end
- xorl %esi, %esi
-
- leal (%edi,%ecx,4), %edi C wp end of mul1
- negl %ecx
-
-
-L(mul1):
- C eax scratch
- C ebx xp end
- C ecx counter, negative
- C edx scratch
- C esi carry
- C edi wp end of mul1
- C ebp multiplier
-
- movl (%ebx,%ecx,4), %eax
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, (%edi,%ecx,4)
- incl %ecx
-
- jnz L(mul1)
-
-
- movl PARAM_YSIZE, %edx
- movl %esi, (%edi) C final carry
-
- movl PARAM_XSIZE, %ecx
- decl %edx
-
- jnz L(ysize_more_than_one_limb)
-
- popl %ebx
- popl %esi
- popl %ebp
- popl %edi
- ret
-
-
-L(ysize_more_than_one_limb):
- cmpl $UNROLL_THRESHOLD, %ecx
- movl PARAM_YP, %eax
-
- jae L(unroll)
-
-
-C -----------------------------------------------------------------------------
-C Simple addmul loop.
-C
-C Using ebx and edi pointing at the ends of their respective locations saves
-C a couple of instructions in the outer loop. The inner loop is still 11
-C cycles, the same as the simple loop in aorsmul_1.asm.
-
- C eax yp
- C ebx xp end
- C ecx xsize
- C edx ysize-1
- C esi
- C edi wp end of mul1
- C ebp
-
- movl 4(%eax), %ebp C multiplier
- negl %ecx
-
- movl %ecx, PARAM_XSIZE C -xsize
- xorl %esi, %esi C initial carry
-
- leal 4(%eax,%edx,4), %eax C yp end
- negl %edx
-
- movl %eax, PARAM_YP
- movl %edx, PARAM_YSIZE
-
- jmp L(simple_outer_entry)
-
-
- C aligning here saves a couple of cycles
- ALIGN(16)
-L(simple_outer_top):
- C edx ysize counter, negative
-
- movl PARAM_YP, %eax C yp end
- xorl %esi, %esi C carry
-
- movl PARAM_XSIZE, %ecx C -xsize
- movl %edx, PARAM_YSIZE
-
- movl (%eax,%edx,4), %ebp C yp limb multiplier
-L(simple_outer_entry):
- addl $4, %edi
-
-
-L(simple_inner):
- C eax scratch
- C ebx xp end
- C ecx counter, negative
- C edx scratch
- C esi carry
- C edi wp end of this addmul
- C ebp multiplier
-
- movl (%ebx,%ecx,4), %eax
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl $0, %edx
- addl %eax, (%edi,%ecx,4)
- adcl %edx, %esi
-
- incl %ecx
- jnz L(simple_inner)
-
-
- movl PARAM_YSIZE, %edx
- movl %esi, (%edi)
-
- incl %edx
- jnz L(simple_outer_top)
-
-
- popl %ebx
- popl %esi
- popl %ebp
- popl %edi
- ret
-
-
-C -----------------------------------------------------------------------------
-C Unrolled loop.
-C
-C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
-C some comments.
-C
-C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
-C 0, inclusive.
-C
-C VAR_JMP is the computed jump into the unrolled loop.
-C
-C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
-C is entered.
-C
-C VAR_XP_LOW is the least significant limb of xp, which is needed at the
-C start of the unrolled loop. This can't just be fetched through the xp
-C pointer because of the offset applied to it.
-C
-C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
-C inclusive.
-C
-C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
-C added to give the location of the next limb of yp, which is the multiplier
-C in the unrolled loop.
-C
-C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
-C to give the starting point in the destination for each unrolled loop (this
-C point is one limb upwards for each limb of yp processed).
-C
-C Having PARAM_YSIZE count negative to zero means it's not necessary to
-C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
-C the stack remain constant and on each loop an leal adjusts them with the
-C PARAM_YSIZE counter value.
-
-
-defframe(VAR_COUNTER, -20)
-defframe(VAR_COUNTER_INIT, -24)
-defframe(VAR_JMP, -28)
-defframe(VAR_XP_LOW, -32)
-deflit(VAR_STACK_SPACE, 16)
-
-dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
-dnl slower in this code, hence the defframe empty-if-zero feature is
-dnl disabled.
-dnl
-dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
-dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
-dnl anything isn't clear.
-dnl
-define(`defframe_empty_if_zero_disabled',1)
-
-L(unroll):
- C eax yp (not used)
- C ebx xp end (not used)
- C ecx xsize
- C edx ysize-1
- C esi
- C edi wp end of mul1 (not used)
- C ebp
-deflit(`FRAME', 16)
-
- leal -2(%ecx), %ebp C one limb processed at start,
- decl %ecx C and ebp is one less
-
- shrl $UNROLL_LOG2, %ebp
- negl %ecx
-
- subl $VAR_STACK_SPACE, %esp
-deflit(`FRAME', 16+VAR_STACK_SPACE)
- andl $UNROLL_MASK, %ecx
-
- movl %ecx, %esi
- shll $4, %ecx
-
- movl %ebp, VAR_COUNTER_INIT
- negl %esi
-
- C 15 code bytes per limb
-ifdef(`PIC',`
- call L(pic_calc)
-L(unroll_here):
-',`
- leal L(unroll_entry) (%ecx,%esi,1), %ecx
-')
-
- movl PARAM_XP, %ebx
- movl %ebp, VAR_COUNTER
-
- movl PARAM_WP, %edi
- movl %ecx, VAR_JMP
-
- movl (%ebx), %eax
- leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
-
- leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
-
- movl %eax, VAR_XP_LOW
-
- movl %ebx, PARAM_XP
- movl PARAM_YP, %ebx
-
- leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
- movl 4(%ebx), %ebp C multiplier (yp second limb)
-
- leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
-
- movl %ecx, PARAM_WP
-
- leal 1(%esi), %ecx C adjust parity for decl %ecx above
-
- movl %ebx, PARAM_YP
- negl %edx
-
- movl %edx, PARAM_YSIZE
- jmp L(unroll_outer_entry)
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%ecx,%esi,1), %ecx
- addl $L(unroll_entry)-L(unroll_here), %ecx
- addl (%esp), %ecx
- ret
-')
-
-
-C -----------------------------------------------------------------------------
- C Aligning here saves a couple of cycles per loop. Using 32 doesn't
- C cost any extra space, since the inner unrolled loop below is
- C aligned to 32.
- ALIGN(32)
-L(unroll_outer_top):
- C edx ysize
-
- movl PARAM_YP, %eax
- movl %edx, PARAM_YSIZE C incremented ysize counter
-
- movl PARAM_WP, %edi
-
- movl VAR_COUNTER_INIT, %ebx
- movl (%eax,%edx,4), %ebp C next multiplier
-
- movl PARAM_XSIZE, %ecx
- leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
-
- movl VAR_XP_LOW, %eax
- movl %ebx, VAR_COUNTER
-
-L(unroll_outer_entry):
- mull %ebp
-
- C using testb is a tiny bit faster than testl
- testb $1, %cl
-
- movl %eax, %ecx C low carry
- movl VAR_JMP, %eax
-
- movl %edx, %esi C high carry
- movl PARAM_XP, %ebx
-
- jnz L(unroll_noswap)
- movl %ecx, %esi C high,low carry other way around
-
- movl %edx, %ecx
-L(unroll_noswap):
-
- jmp *%eax
-
-
-
-C -----------------------------------------------------------------------------
- ALIGN(32)
-L(unroll_top):
- C eax scratch
- C ebx xp
- C ecx carry low
- C edx scratch
- C esi carry high
- C edi wp
- C ebp multiplier
- C VAR_COUNTER loop counter
- C
- C 15 code bytes each limb
-
- leal UNROLL_BYTES(%edi), %edi
-
-L(unroll_entry):
-deflit(CHUNK_COUNT,2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*CHUNK_COUNT*4))
- deflit(`disp1', eval(disp0 + 4))
- deflit(`disp2', eval(disp1 + 4))
-
- movl disp1(%ebx), %eax
- mull %ebp
-Zdisp( addl, %ecx, disp0,(%edi))
- adcl %eax, %esi
- movl %edx, %ecx
- jadcl0( %ecx)
-
- movl disp2(%ebx), %eax
- mull %ebp
- addl %esi, disp1(%edi)
- adcl %eax, %ecx
- movl %edx, %esi
- jadcl0( %esi)
-')
-
- decl VAR_COUNTER
- leal UNROLL_BYTES(%ebx), %ebx
-
- jns L(unroll_top)
-
-
- movl PARAM_YSIZE, %edx
- addl %ecx, UNROLL_BYTES(%edi)
-
- adcl $0, %esi
-
- incl %edx
- movl %esi, UNROLL_BYTES+4(%edi)
-
- jnz L(unroll_outer_top)
-
-
- movl SAVE_ESI, %esi
- movl SAVE_EBP, %ebp
- movl SAVE_EDI, %edi
- movl SAVE_EBX, %ebx
-
- addl $FRAME, %esp
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
deleted file mode 100644
index 70d49b3e57..0000000000
--- a/rts/gmp/mpn/x86/k6/sqr_basecase.asm
+++ /dev/null
@@ -1,672 +0,0 @@
-dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
-dnl
-dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
-dnl product (measured on the speed difference between 17 and 33 limbs,
-dnl which is roughly the Karatsuba recursing range).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
-dnl code supports. This value is used only by the tune program to know
-dnl what it can go up to. (An attempt to compile with a bigger value will
-dnl trigger some m4_assert()s in the code, making the build fail.)
-dnl
-dnl The value is determined by requiring the displacements in the unrolled
-dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
-dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
-
-deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
-
-
-dnl Allow a value from the tune program to override config.m4.
-
-ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
-`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
-
-
-dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
-dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
-dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
-dnl
-dnl The first addmul is the biggest, and this takes the second least
-dnl significant limb and multiplies it by the third least significant and
-dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
-dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
-
-m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
-deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
-
-
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
-C lot of function call overheads are avoided, especially when the given size
-C is small.
-C
-C The code size might look a bit excessive, but not all of it is executed
-C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
-C clearly apply only to those sizes; mid sizes like 10x10 only need part of
-C the unrolled addmul; and big sizes like 35x35 that do need all of it will
-C at least be getting value for money, because 35x35 spends something like
-C 5780 cycles here.
-C
-C Different values of UNROLL_COUNT give slightly different speeds, between
-C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
-C This isn't a big difference, but it's presumably some alignment effect
-C which if understood could give a simple speedup.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl PARAM_SRC, %eax
-
- cmpl $2, %ecx
- je L(two_limbs)
-
- movl PARAM_DST, %edx
- ja L(three_or_more)
-
-
-C -----------------------------------------------------------------------------
-C one limb only
- C eax src
- C ebx
- C ecx size
- C edx dst
-
- movl (%eax), %eax
- movl %edx, %ecx
-
- mull %eax
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(two_limbs):
- C eax src
- C ebx
- C ecx size
- C edx dst
-
- pushl %ebx
- movl %eax, %ebx C src
-deflit(`FRAME',4)
-
- movl (%ebx), %eax
- movl PARAM_DST, %ecx
-
- mull %eax C src[0]^2
-
- movl %eax, (%ecx)
- movl 4(%ebx), %eax
-
- movl %edx, 4(%ecx)
-
- mull %eax C src[1]^2
-
- movl %eax, 8(%ecx)
- movl (%ebx), %eax
-
- movl %edx, 12(%ecx)
- movl 4(%ebx), %edx
-
- mull %edx C src[0]*src[1]
-
- addl %eax, 4(%ecx)
-
- adcl %edx, 8(%ecx)
- adcl $0, 12(%ecx)
-
- popl %ebx
- addl %eax, 4(%ecx)
-
- adcl %edx, 8(%ecx)
- adcl $0, 12(%ecx)
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(three_or_more):
-deflit(`FRAME',0)
- cmpl $4, %ecx
- jae L(four_or_more)
-
-
-C -----------------------------------------------------------------------------
-C three limbs
- C eax src
- C ecx size
- C edx dst
-
- pushl %ebx
- movl %eax, %ebx C src
-
- movl (%ebx), %eax
- movl %edx, %ecx C dst
-
- mull %eax C src[0] ^ 2
-
- movl %eax, (%ecx)
- movl 4(%ebx), %eax
-
- movl %edx, 4(%ecx)
- pushl %esi
-
- mull %eax C src[1] ^ 2
-
- movl %eax, 8(%ecx)
- movl 8(%ebx), %eax
-
- movl %edx, 12(%ecx)
- pushl %edi
-
- mull %eax C src[2] ^ 2
-
- movl %eax, 16(%ecx)
- movl (%ebx), %eax
-
- movl %edx, 20(%ecx)
- movl 4(%ebx), %edx
-
- mull %edx C src[0] * src[1]
-
- movl %eax, %esi
- movl (%ebx), %eax
-
- movl %edx, %edi
- movl 8(%ebx), %edx
-
- pushl %ebp
- xorl %ebp, %ebp
-
- mull %edx C src[0] * src[2]
-
- addl %eax, %edi
- movl 4(%ebx), %eax
-
- adcl %edx, %ebp
-
- movl 8(%ebx), %edx
-
- mull %edx C src[1] * src[2]
-
- addl %eax, %ebp
-
- adcl $0, %edx
-
-
- C eax will be dst[5]
- C ebx
- C ecx dst
- C edx dst[4]
- C esi dst[1]
- C edi dst[2]
- C ebp dst[3]
-
- xorl %eax, %eax
- addl %esi, %esi
- adcl %edi, %edi
- adcl %ebp, %ebp
- adcl %edx, %edx
- adcl $0, %eax
-
- addl %esi, 4(%ecx)
- adcl %edi, 8(%ecx)
- adcl %ebp, 12(%ecx)
-
- popl %ebp
- popl %edi
-
- adcl %edx, 16(%ecx)
-
- popl %esi
- popl %ebx
-
- adcl %eax, 20(%ecx)
- ASSERT(nc)
-
- ret
-
-
-C -----------------------------------------------------------------------------
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-defframe(VAR_COUNTER,-20)
-defframe(VAR_JMP, -24)
-deflit(STACK_SPACE, 24)
-
- ALIGN(16)
-L(four_or_more):
-
- C eax src
- C ebx
- C ecx size
- C edx dst
- C esi
- C edi
- C ebp
-
-C First multiply src[0]*src[1..size-1] and store at dst[1..size].
-C
-C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
-C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
-C a 5780 cycle operation, which is not surprising since the loop here is 8
-C c/l and mpn_mul_1 is 6.25 c/l.
-
- subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
-
- movl %edi, SAVE_EDI
- leal 4(%edx), %edi
-
- movl %ebx, SAVE_EBX
- leal 4(%eax), %ebx
-
- movl %esi, SAVE_ESI
- xorl %esi, %esi
-
- movl %ebp, SAVE_EBP
-
- C eax
- C ebx src+4
- C ecx size
- C edx
- C esi
- C edi dst+4
- C ebp
-
- movl (%eax), %ebp C multiplier
- leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
-
-
- ALIGN(16)
-L(mul_1):
- C eax scratch
- C ebx src ptr
- C ecx counter
- C edx scratch
- C esi carry
- C edi dst ptr
- C ebp multiplier
-
- movl (%ebx), %eax
- addl $4, %ebx
-
- mull %ebp
-
- addl %esi, %eax
- movl $0, %esi
-
- adcl %edx, %esi
-
- movl %eax, (%edi)
- addl $4, %edi
-
- loop L(mul_1)
-
-
-C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
-C
-C The last two addmuls, which are the bottom right corner of the product
-C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
-C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
-C cases that need to be done.
-C
-C The unrolled code is the same as mpn_addmul_1(), see that routine for some
-C comments.
-C
-C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
-C
-C VAR_JMP is the computed jump into the unrolled code, stepped by one code
-C chunk each outer loop.
-C
-C K6 doesn't do any branch prediction on indirect jumps, which is good
-C actually because it's a different target each time. The unrolled addmul
-C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
-C the indirect jump is quickly recovered.
-
-
-dnl This value is also implicitly encoded in a shift and add.
-dnl
-deflit(CODE_BYTES_PER_LIMB, 15)
-
-dnl With the unmodified &src[size] and &dst[size] pointers, the
-dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
-dnl values up to 31. Above that an offset must be added to them.
-dnl
-deflit(OFFSET,
-ifelse(eval(UNROLL_COUNT>31),1,
-eval((UNROLL_COUNT-31)*4),
-0))
-
- C eax
- C ebx &src[size]
- C ecx
- C edx
- C esi carry
- C edi &dst[size]
- C ebp
-
- movl PARAM_SIZE, %ecx
- movl %esi, (%edi)
-
- subl $4, %ecx
- jz L(corner)
-
- movl %ecx, %edx
-ifelse(OFFSET,0,,
-` subl $OFFSET, %ebx')
-
- shll $4, %ecx
-ifelse(OFFSET,0,,
-` subl $OFFSET, %edi')
-
- negl %ecx
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
-')
- negl %edx
-
-
- C The calculated jump mustn't be before the start of the available
- C code. This is the limitation UNROLL_COUNT puts on the src operand
- C size, but checked here using the jump address directly.
- C
- ASSERT(ae,`
- movl_text_address( L(unroll_inner_start), %eax)
- cmpl %eax, %ecx
- ')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll_outer_top):
- C eax
- C ebx &src[size], constant
- C ecx VAR_JMP
- C edx VAR_COUNTER, limbs, negative
- C esi high limb to store
- C edi dst ptr, high of last addmul
- C ebp
-
- movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
- movl %edx, VAR_COUNTER
-
- movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
-
- mull %ebp
-
- testb $1, %cl
-
- movl %edx, %esi C high carry
- movl %ecx, %edx C jump
-
- movl %eax, %ecx C low carry
- leal CODE_BYTES_PER_LIMB(%edx), %edx
-
- movl %edx, VAR_JMP
- leal 4(%edi), %edi
-
- C A branch-free version of this using some xors was found to be a
- C touch slower than just a conditional jump, despite the jump
- C switching between taken and not taken on every loop.
-
-ifelse(eval(UNROLL_COUNT%2),0,
- jz,jnz) L(unroll_noswap)
- movl %esi, %eax C high,low carry other way around
-
- movl %ecx, %esi
- movl %eax, %ecx
-L(unroll_noswap):
-
- jmp *%edx
-
-
- C Must be on an even address here so the low bit of the jump address
- C will indicate which way around ecx/esi should start.
- C
- C An attempt was made at padding here to get the end of the unrolled
- C code to come out on a good alignment, to save padding before
- C L(corner). This worked, but turned out to run slower than just an
- C ALIGN(2). The reason for this is not clear, it might be related
- C to the different speeds on different UNROLL_COUNTs noted above.
-
- ALIGN(2)
-
-L(unroll_inner_start):
- C eax scratch
- C ebx src
- C ecx carry low
- C edx scratch
- C esi carry high
- C edi dst
- C ebp multiplier
- C
- C 15 code bytes each limb
- C ecx/esi swapped on each chunk
-
-forloop(`i', UNROLL_COUNT, 1, `
- deflit(`disp_src', eval(-i*4 + OFFSET))
- deflit(`disp_dst', eval(disp_src - 4))
-
- m4_assert(`disp_src>=-128 && disp_src<128')
- m4_assert(`disp_dst>=-128 && disp_dst<128')
-
-ifelse(eval(i%2),0,`
-Zdisp( movl, disp_src,(%ebx), %eax)
- mull %ebp
-Zdisp( addl, %esi, disp_dst,(%edi))
- adcl %eax, %ecx
- movl %edx, %esi
- jadcl0( %esi)
-',`
- dnl this one comes out last
-Zdisp( movl, disp_src,(%ebx), %eax)
- mull %ebp
-Zdisp( addl, %ecx, disp_dst,(%edi))
- adcl %eax, %esi
- movl %edx, %ecx
- jadcl0( %ecx)
-')
-')
-L(unroll_inner_end):
-
- addl %esi, -4+OFFSET(%edi)
-
- movl VAR_COUNTER, %edx
- jadcl0( %ecx)
-
- movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
- movl VAR_JMP, %ecx
-
- incl %edx
- jnz L(unroll_outer_top)
-
-
-ifelse(OFFSET,0,,`
- addl $OFFSET, %ebx
- addl $OFFSET, %edi
-')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(corner):
- C ebx &src[size]
- C edi &dst[2*size-5]
-
- movl -12(%ebx), %ebp
-
- movl -8(%ebx), %eax
- movl %eax, %ecx
-
- mull %ebp
-
- addl %eax, -4(%edi)
- adcl $0, %edx
-
- movl -4(%ebx), %eax
- movl %edx, %esi
- movl %eax, %ebx
-
- mull %ebp
-
- addl %esi, %eax
- adcl $0, %edx
-
- addl %eax, (%edi)
- adcl $0, %edx
-
- movl %edx, %esi
- movl %ebx, %eax
-
- mull %ecx
-
- addl %esi, %eax
- movl %eax, 4(%edi)
-
- adcl $0, %edx
-
- movl %edx, 8(%edi)
-
-
-C -----------------------------------------------------------------------------
-C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
-C The loop measures about 6 cycles/iteration, though it looks like it should
-C decode in 5.
-
-L(lshift_start):
- movl PARAM_SIZE, %ecx
-
- movl PARAM_DST, %edi
- subl $1, %ecx C size-1 and clear carry
-
- movl PARAM_SRC, %ebx
- movl %ecx, %edx
-
- xorl %eax, %eax C ready for adcl
-
-
- ALIGN(16)
-L(lshift):
- C eax
- C ebx src (for later use)
- C ecx counter, decrementing
- C edx size-1 (for later use)
- C esi
- C edi dst, incrementing
- C ebp
-
- rcll 4(%edi)
- rcll 8(%edi)
- leal 8(%edi), %edi
- loop L(lshift)
-
-
- adcl %eax, %eax
-
- movl %eax, 4(%edi) C dst most significant limb
- movl (%ebx), %eax C src[0]
-
- leal 4(%ebx,%edx,4), %ebx C &src[size]
- subl %edx, %ecx C -(size-1)
-
-
-C -----------------------------------------------------------------------------
-C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
-C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
-C low limb of src[0]^2.
-
-
- mull %eax
-
- movl %eax, (%edi,%ecx,8) C dst[0]
-
-
- ALIGN(16)
-L(diag):
- C eax scratch
- C ebx &src[size]
- C ecx counter, negative
- C edx carry
- C esi scratch
- C edi dst[2*size-2]
- C ebp
-
- movl (%ebx,%ecx,4), %eax
- movl %edx, %esi
-
- mull %eax
-
- addl %esi, 4(%edi,%ecx,8)
- adcl %eax, 8(%edi,%ecx,8)
- adcl $0, %edx
-
- incl %ecx
- jnz L(diag)
-
-
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
-
- addl %edx, 4(%edi) C dst most significant limb
-
- movl SAVE_EDI, %edi
- movl SAVE_EBP, %ebp
- addl $FRAME, %esp
- ret
-
-
-
-C -----------------------------------------------------------------------------
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- addl (%esp), %ecx
- addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
- addl %edx, %ecx
- ret
-')
-
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README
deleted file mode 100644
index c34315c401..0000000000
--- a/rts/gmp/mpn/x86/k7/README
+++ /dev/null
@@ -1,145 +0,0 @@
-
- AMD K7 MPN SUBROUTINES
-
-
-This directory contains code optimized for the AMD Athlon CPU.
-
-The mmx subdirectory has routines using MMX instructions. All Athlons have
-MMX, the separate directory is just so that configure can omit it if the
-assembler doesn't support MMX.
-
-
-
-STATUS
-
-Times for the loops, with all code and data in L1 cache.
-
- cycles/limb
- mpn_add/sub_n 1.6
-
- mpn_copyi 0.75 or 1.0 \ varying with data alignment
- mpn_copyd 0.75 or 1.0 /
-
- mpn_divrem_1 17.0 integer part, 15.0 fractional part
- mpn_mod_1 17.0
- mpn_divexact_by3 8.0
-
- mpn_l/rshift 1.2
-
- mpn_mul_1 3.4
- mpn_addmul/submul_1 3.9
-
- mpn_mul_basecase 4.42 cycles/crossproduct (approx)
-
- mpn_popcount 5.0
- mpn_hamdist 6.0
-
-Prefetching of sources hasn't yet been tried.
-
-
-
-NOTES
-
-cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
-
-Write-allocate L1 data cache means prefetching of destinations is unnecessary.
-
-Floating point multiplications can be done in parallel with integer
-multiplications, but there doesn't seem to be any way to make use of this.
-
-Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on
-the speed of the multiplication routines. The documentation shows mul
-executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
-to get near 3 cycles code has to be arranged so that nothing else is issued
-to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other
-apparently equivalent code takes 5.
-
-
-
-OPTIMIZATIONS
-
-Unrolled loops are used to reduce looping overhead. The unrolling is
-configurable up to 32 limbs/loop for most routines and up to 64 for some.
-The K7 has 64k L1 code cache so quite big unrolling is allowable.
-
-Computed jumps into the unrolling are used to handle sizes not a multiple of
-the unrolling. An attractive feature of this is that times increase
-smoothly with operand size, but it may be that some routines should just
-have simple loops to finish up, especially when PIC adds between 2 and 16
-cycles to get %eip.
-
-Position independent code is implemented using a call to get %eip for the
-computed jumps and a ret is always done, rather than an addl $4,%esp or a
-popl, so the CPU return address branch prediction stack stays synchronised
-with the actual stack in memory.
-
-Branch prediction, in absence of any history, will guess forward jumps are
-not taken and backward jumps are taken. Where possible it's arranged that
-the less likely or less important case is under a taken forward jump.
-
-
-
-CODING
-
-Instructions in general code have been shown grouped if they can execute
-together, which means up to three direct-path instructions which have no
-successive dependencies. K7 always decodes three and has out-of-order
-execution, but the groupings show what slots might be available and what
-dependency chains exist.
-
-When there's vector-path instructions an effort is made to get triplets of
-direct-path instructions in between them, even if there's dependencies,
-since this maximizes decoding throughput and might save a cycle or two if
-decoding is the limiting factor.
-
-
-
-INSTRUCTIONS
-
-adcl direct
-divl 39 cycles back-to-back
-lodsl,etc vector
-loop 1 cycle vector (decl/jnz opens up one decode slot)
-movd reg vector
-movd mem direct
-mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
-popl vector (use movl for more than one pop)
-pushl direct, will pair with a load
-shrdl %cl vector, 3 cycles, seems to be 3 decode too
-xorl r,r false read dependency recognised
-
-
-
-REFERENCES
-
-"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
-22007, revision E, November 1999. Available on-line,
-
- http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf
-
-"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
-This describes the femms and prefetch instructions. Available on-line,
-
- http://www.amd.com/K6/k6docs/pdf/21928.pdf
-
-"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
-publication number 22466, revision B, August 1999. This describes
-instructions added in the Athlon processor, such as pswapd and the extra
-prefetch forms. Available on-line,
-
- http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf
-
-"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
-August 1999. This has some notes on general Athlon optimizations as well as
-3DNow. Available on-line,
-
- http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
-
-
-
-
-----------------
-Local variables:
-mode: text
-fill-column: 76
-End:
diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm
deleted file mode 100644
index 85fa9d3036..0000000000
--- a/rts/gmp/mpn/x86/k7/aors_n.asm
+++ /dev/null
@@ -1,250 +0,0 @@
-dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
-dnl
-dnl K7: 1.64 cycles/limb (at 16 limb/loop).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 8 1.9
-dnl 16 1.64
-dnl 32 1.7
-dnl 64 2.0
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_add_n', `
- define(M4_inst, adcl)
- define(M4_function_n, mpn_add_n)
- define(M4_function_nc, mpn_add_nc)
- define(M4_description, add)
-',`ifdef(`OPERATION_sub_n', `
- define(M4_inst, sbbl)
- define(M4_function_n, mpn_sub_n)
- define(M4_function_nc, mpn_sub_nc)
- define(M4_description, subtract)
-',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-
-C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-C
-C Calculate src1,size M4_description src2,size, and store the result in
-C dst,size. The return value is the carry bit from the top of the result (1
-C or 0).
-C
-C The _nc version accepts 1 or 0 for an initial carry into the low limb of
-C the calculation. Note values other than 1 or 0 here will lead to garbage
-C results.
-C
-C This code runs at 1.64 cycles/limb, which is probably the best possible
-C with plain integer operations. Each limb is 2 loads and 1 store, and in
-C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
-C c/l.
-
-dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 8)
-',`
-deflit(UNROLL_THRESHOLD, 8)
-')
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC2, 12)
-defframe(PARAM_SRC1, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBP, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EBX, -12)
-defframe(SAVE_EDI, -16)
-deflit(STACK_SPACE, 16)
-
- .text
- ALIGN(32)
-deflit(`FRAME',0)
-
-PROLOGUE(M4_function_nc)
- movl PARAM_CARRY, %eax
- jmp LF(M4_function_n,start)
-EPILOGUE()
-
-PROLOGUE(M4_function_n)
-
- xorl %eax, %eax C carry
-L(start):
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %edi, SAVE_EDI
- movl %ebx, SAVE_EBX
- cmpl $UNROLL_THRESHOLD, %ecx
-
- movl PARAM_SRC2, %edx
- movl PARAM_SRC1, %ebx
- jae L(unroll)
-
- movl PARAM_DST, %edi
- leal (%ebx,%ecx,4), %ebx
- leal (%edx,%ecx,4), %edx
-
- leal (%edi,%ecx,4), %edi
- negl %ecx
- shrl %eax
-
- C This loop in in a single 16 byte code block already, so no
- C alignment necessary.
-L(simple):
- C eax scratch
- C ebx src1
- C ecx counter
- C edx src2
- C esi
- C edi dst
- C ebp
-
- movl (%ebx,%ecx,4), %eax
- M4_inst (%edx,%ecx,4), %eax
- movl %eax, (%edi,%ecx,4)
- incl %ecx
- jnz L(simple)
-
- movl $0, %eax
- movl SAVE_EDI, %edi
-
- movl SAVE_EBX, %ebx
- setc %al
- addl $STACK_SPACE, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
- C This is at 0x55, close enough to aligned.
-L(unroll):
-deflit(`FRAME',STACK_SPACE)
- movl %ebp, SAVE_EBP
- andl $-2, %ecx C size low bit masked out
- andl $1, PARAM_SIZE C size low bit kept
-
- movl %ecx, %edi
- decl %ecx
- movl PARAM_DST, %ebp
-
- shrl $UNROLL_LOG2, %ecx
- negl %edi
- movl %esi, SAVE_ESI
-
- andl $UNROLL_MASK, %edi
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
-')
- negl %edi
- shrl %eax
-
- leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
- leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
- leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
-
- jmp *%esi
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%edi,%edi,8), %esi
- addl $L(entry)-L(here), %esi
- addl (%esp), %esi
- ret
-')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(32)
-L(top):
- C eax zero
- C ebx src1
- C ecx counter
- C edx src2
- C esi scratch (was computed jump)
- C edi dst
- C ebp scratch
-
- leal UNROLL_BYTES(%edx), %edx
-
-L(entry):
-deflit(CHUNK_COUNT, 2)
-forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 + 4))
-
-Zdisp( movl, disp0,(%ebx), %esi)
- movl disp1(%ebx), %ebp
-Zdisp( M4_inst,disp0,(%edx), %esi)
-Zdisp( movl, %esi, disp0,(%edi))
- M4_inst disp1(%edx), %ebp
- movl %ebp, disp1(%edi)
-')
-
- decl %ecx
- leal UNROLL_BYTES(%ebx), %ebx
- leal UNROLL_BYTES(%edi), %edi
- jns L(top)
-
-
- mov PARAM_SIZE, %esi
- movl SAVE_EBP, %ebp
- movl $0, %eax
-
- decl %esi
- js L(even)
-
- movl (%ebx), %ecx
- M4_inst UNROLL_BYTES(%edx), %ecx
- movl %ecx, (%edi)
-L(even):
-
- movl SAVE_EDI, %edi
- movl SAVE_EBX, %ebx
- setc %al
-
- movl SAVE_ESI, %esi
- addl $STACK_SPACE, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
deleted file mode 100644
index 9f9c3daaf4..0000000000
--- a/rts/gmp/mpn/x86/k7/aorsmul_1.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl
-dnl K7: 3.9 cycles/limb.
-dnl
-dnl Future: It should be possible to avoid the separate mul after the
-dnl unrolled loop by moving the movl/adcl to the top.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 4 4.42
-dnl 8 4.16
-dnl 16 3.9
-dnl 32 3.9
-dnl 64 3.87
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_addmul_1',`
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
- define(M4_function_1c, mpn_addmul_1c)
- define(M4_description, add it to)
- define(M4_desc_retval, carry)
-',`ifdef(`OPERATION_submul_1',`
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
- define(M4_function_1c, mpn_submul_1c)
- define(M4_description, subtract it from)
- define(M4_desc_retval, borrow)
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult, mp_limb_t carry);
-C
-C Calculate src,size multiplied by mult and M4_description dst,size.
-C Return the M4_desc_retval limb from the top of the result.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 9)
-',`
-deflit(UNROLL_THRESHOLD, 6)
-')
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(SAVE_SIZE, 16)
-
- .text
- ALIGN(32)
-PROLOGUE(M4_function_1)
- movl PARAM_SIZE, %edx
- movl PARAM_SRC, %eax
- xorl %ecx, %ecx
-
- decl %edx
- jnz LF(M4_function_1c,start_1)
-
- movl (%eax), %eax
- movl PARAM_DST, %ecx
-
- mull PARAM_MULTIPLIER
-
- M4_inst %eax, (%ecx)
- adcl $0, %edx
- movl %edx, %eax
-
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(M4_function_1c)
- movl PARAM_SIZE, %edx
- movl PARAM_SRC, %eax
-
- decl %edx
- jnz L(more_than_one_limb)
-
- movl (%eax), %eax
- movl PARAM_DST, %ecx
-
- mull PARAM_MULTIPLIER
-
- addl PARAM_CARRY, %eax
-
- adcl $0, %edx
- M4_inst %eax, (%ecx)
-
- adcl $0, %edx
- movl %edx, %eax
-
- ret
-
-
- C offset 0x44 so close enough to aligned
-L(more_than_one_limb):
- movl PARAM_CARRY, %ecx
-L(start_1):
- C eax src
- C ecx initial carry
- C edx size-1
- subl $SAVE_SIZE, %esp
-deflit(`FRAME',16)
-
- movl %ebx, SAVE_EBX
- movl %esi, SAVE_ESI
- movl %edx, %ebx C size-1
-
- movl PARAM_SRC, %esi
- movl %ebp, SAVE_EBP
- cmpl $UNROLL_THRESHOLD, %edx
-
- movl PARAM_MULTIPLIER, %ebp
- movl %edi, SAVE_EDI
-
- movl (%esi), %eax C src low limb
- movl PARAM_DST, %edi
- ja L(unroll)
-
-
- C simple loop
-
- leal 4(%esi,%ebx,4), %esi C point one limb past last
- leal (%edi,%ebx,4), %edi C point at last limb
- negl %ebx
-
- C The movl to load the next source limb is done well ahead of the
- C mul. This is necessary for full speed, and leads to one limb
- C handled separately at the end.
-
-L(simple):
- C eax src limb
- C ebx loop counter
- C ecx carry limb
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
- mull %ebp
-
- addl %eax, %ecx
- adcl $0, %edx
-
- M4_inst %ecx, (%edi,%ebx,4)
- movl (%esi,%ebx,4), %eax
- adcl $0, %edx
-
- incl %ebx
- movl %edx, %ecx
- jnz L(simple)
-
-
- mull %ebp
-
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
- movl SAVE_EBP, %ebp
-
- addl %eax, %ecx
- adcl $0, %edx
-
- M4_inst %ecx, (%edi)
- adcl $0, %edx
- movl SAVE_EDI, %edi
-
- addl $SAVE_SIZE, %esp
- movl %edx, %eax
- ret
-
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax src low limb
- C ebx size-1
- C ecx carry
- C edx size-1
- C esi src
- C edi dst
- C ebp multiplier
-
-dnl overlapping with parameters no longer needed
-define(VAR_COUNTER,`PARAM_SIZE')
-define(VAR_JUMP, `PARAM_MULTIPLIER')
-
- subl $2, %ebx C (size-2)-1
- decl %edx C size-2
-
- shrl $UNROLL_LOG2, %ebx
- negl %edx
-
- movl %ebx, VAR_COUNTER
- andl $UNROLL_MASK, %edx
-
- movl %edx, %ebx
- shll $4, %edx
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%edx,%ebx,1), %edx
-')
- negl %ebx
- movl %edx, VAR_JUMP
-
- mull %ebp
-
- addl %eax, %ecx C initial carry, becomes low carry
- adcl $0, %edx
- testb $1, %bl
-
- movl 4(%esi), %eax C src second limb
- leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
-
- movl %edx, %ebx C high carry
- cmovnz( %ecx, %ebx) C high,low carry other way around
- cmovnz( %edx, %ecx)
-
- jmp *VAR_JUMP
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%edx,%ebx,1), %edx
- addl $L(entry)-L(here), %edx
- addl (%esp), %edx
- ret
-')
-
-
-C -----------------------------------------------------------------------------
-C This code uses a "two carry limbs" scheme. At the top of the loop the
-C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
-C the computed jump an odd size means they start one way around, an even
-C size the other. Either way one limb is handled separately at the start of
-C the loop.
-C
-C The positioning of the movl to load the next source limb is important.
-C Moving it after the adcl with a view to avoiding a separate mul at the end
-C of the loop slows the code down.
-
- ALIGN(32)
-L(top):
- C eax src limb
- C ebx carry high
- C ecx carry low
- C edx scratch
- C esi src+8
- C edi dst
- C ebp multiplier
- C
- C VAR_COUNTER loop counter
- C
- C 17 bytes each limb
-
-L(entry):
-deflit(CHUNK_COUNT,2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 + 4))
-
- mull %ebp
-
-Zdisp( M4_inst,%ecx, disp0,(%edi))
- movl $0, %ecx
-
- adcl %eax, %ebx
-
-Zdisp( movl, disp0,(%esi), %eax)
- adcl %edx, %ecx
-
-
- mull %ebp
-
- M4_inst %ebx, disp1(%edi)
- movl $0, %ebx
-
- adcl %eax, %ecx
-
- movl disp1(%esi), %eax
- adcl %edx, %ebx
-')
-
- decl VAR_COUNTER
- leal UNROLL_BYTES(%esi), %esi
- leal UNROLL_BYTES(%edi), %edi
-
- jns L(top)
-
-
- C eax src limb
- C ebx carry high
- C ecx carry low
- C edx
- C esi
- C edi dst (points at second last limb)
- C ebp multiplier
-deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
-deflit(`disp1', eval(disp0-0 + 4))
-
- mull %ebp
-
- M4_inst %ecx, disp0(%edi)
- movl SAVE_EBP, %ebp
-
- adcl %ebx, %eax
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
-
- adcl $0, %edx
- M4_inst %eax, disp1(%edi)
- movl SAVE_EDI, %edi
-
- adcl $0, %edx
- addl $SAVE_SIZE, %esp
-
- movl %edx, %eax
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm
deleted file mode 100644
index 57684958a5..0000000000
--- a/rts/gmp/mpn/x86/k7/diveby3.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
-dnl
-dnl K7: 8.0 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t carry);
-
-defframe(PARAM_CARRY,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl multiplicative inverse of 3, modulo 2^32
-deflit(INVERSE_3, 0xAAAAAAAB)
-
-dnl ceil(b/3) and floor(b*2/3) where b=2^32
-deflit(ONE_THIRD_CEIL, 0x55555556)
-deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_divexact_by3c)
-deflit(`FRAME',0)
-
- movl PARAM_SRC, %ecx
- pushl %ebx defframe_pushl(SAVE_EBX)
-
- movl PARAM_CARRY, %ebx
- pushl %ebp defframe_pushl(SAVE_EBP)
-
- movl PARAM_SIZE, %ebp
- pushl %edi defframe_pushl(SAVE_EDI)
-
- movl (%ecx), %eax C src low limb
- pushl %esi defframe_pushl(SAVE_ESI)
-
- movl PARAM_DST, %edi
- movl $TWO_THIRDS_FLOOR, %esi
- leal -4(%ecx,%ebp,4), %ecx C &src[size-1]
-
- subl %ebx, %eax
-
- setc %bl
- decl %ebp
- jz L(last)
-
- leal (%edi,%ebp,4), %edi C &dst[size-1]
- negl %ebp
-
-
- ALIGN(16)
-L(top):
- C eax src limb, carry subtracted
- C ebx carry limb (0 or 1)
- C ecx &src[size-1]
- C edx scratch
- C esi TWO_THIRDS_FLOOR
- C edi &dst[size-1]
- C ebp counter, limbs, negative
-
- imull $INVERSE_3, %eax, %edx
-
- movl 4(%ecx,%ebp,4), %eax C next src limb
- cmpl $ONE_THIRD_CEIL, %edx
-
- sbbl $-1, %ebx C +1 if result>=ceil(b/3)
- cmpl %edx, %esi
-
- sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3)
- movl %edx, (%edi,%ebp,4)
- incl %ebp
-
- setc %bl C new carry
- jnz L(top)
-
-
-
-L(last):
- C eax src limb, carry subtracted
- C ebx carry limb (0 or 1)
- C ecx &src[size-1]
- C edx scratch
- C esi multiplier
- C edi &dst[size-1]
- C ebp
-
- imull $INVERSE_3, %eax
-
- cmpl $ONE_THIRD_CEIL, %eax
- movl %eax, (%edi)
- movl SAVE_EBP, %ebp
-
- sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
- cmpl %eax, %esi
- movl $0, %eax
-
- adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3)
- movl SAVE_EDI, %edi
- movl SAVE_ESI, %esi
-
- movl SAVE_EBX, %ebx
- addl $FRAME, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h
deleted file mode 100644
index c3bba0afc4..0000000000
--- a/rts/gmp/mpn/x86/k7/gmp-mparam.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-#define BITS_PER_MP_LIMB 32
-#define BYTES_PER_MP_LIMB 4
-#define BITS_PER_LONGINT 32
-#define BITS_PER_INT 32
-#define BITS_PER_SHORTINT 16
-#define BITS_PER_CHAR 8
-
-
-/* the low limb is ready after 4 cycles, but normally it's the high limb
- which is of interest, and that comes out after 6 cycles */
-#ifndef UMUL_TIME
-#define UMUL_TIME 6 /* cycles */
-#endif
-
-/* AMD doco says 40, but it measures 39 back-to-back */
-#ifndef UDIV_TIME
-#define UDIV_TIME 39 /* cycles */
-#endif
-
-/* using bsf */
-#ifndef COUNT_TRAILING_ZEROS_TIME
-#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */
-#endif
-
-
-/* Generated by tuneup.c, 2000-07-06. */
-
-#ifndef KARATSUBA_MUL_THRESHOLD
-#define KARATSUBA_MUL_THRESHOLD 26
-#endif
-#ifndef TOOM3_MUL_THRESHOLD
-#define TOOM3_MUL_THRESHOLD 177
-#endif
-
-#ifndef KARATSUBA_SQR_THRESHOLD
-#define KARATSUBA_SQR_THRESHOLD 52
-#endif
-#ifndef TOOM3_SQR_THRESHOLD
-#define TOOM3_SQR_THRESHOLD 173
-#endif
-
-#ifndef BZ_THRESHOLD
-#define BZ_THRESHOLD 76
-#endif
-
-#ifndef FIB_THRESHOLD
-#define FIB_THRESHOLD 114
-#endif
-
-#ifndef POWM_THRESHOLD
-#define POWM_THRESHOLD 34
-#endif
-
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 5
-#endif
-#ifndef GCDEXT_THRESHOLD
-#define GCDEXT_THRESHOLD 54
-#endif
-
-#ifndef FFT_MUL_TABLE
-#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 }
-#endif
-#ifndef FFT_MODF_MUL_THRESHOLD
-#define FFT_MODF_MUL_THRESHOLD 736
-#endif
-#ifndef FFT_MUL_THRESHOLD
-#define FFT_MUL_THRESHOLD 6912
-#endif
-
-#ifndef FFT_SQR_TABLE
-#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 }
-#endif
-#ifndef FFT_MODF_SQR_THRESHOLD
-#define FFT_MODF_SQR_THRESHOLD 800
-#endif
-#ifndef FFT_SQR_THRESHOLD
-#define FFT_SQR_THRESHOLD 8448
-#endif
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
deleted file mode 100644
index 33214daa1f..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/copyd.asm
+++ /dev/null
@@ -1,136 +0,0 @@
-dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
-dnl
-dnl alignment dst/src, A=0mod8 N=4mod8
-dnl A/A A/N N/A N/N
-dnl K7 0.75 1.0 1.0 0.75
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C The various comments in mpn/x86/k7/copyi.asm apply here too.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-dnl parameter space reused
-define(SAVE_EBX,`PARAM_SIZE')
-define(SAVE_ESI,`PARAM_SRC')
-
-dnl minimum 5 since the unrolled code can't handle less than 5
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_copyd)
-
- movl PARAM_SIZE, %ecx
- movl %ebx, SAVE_EBX
-
- movl PARAM_SRC, %eax
- movl PARAM_DST, %edx
-
- cmpl $UNROLL_THRESHOLD, %ecx
- jae L(unroll)
-
- orl %ecx, %ecx
- jz L(simple_done)
-
-L(simple):
- C eax src
- C ebx scratch
- C ecx counter
- C edx dst
- C
- C this loop is 2 cycles/limb
-
- movl -4(%eax,%ecx,4), %ebx
- movl %ebx, -4(%edx,%ecx,4)
- decl %ecx
- jnz L(simple)
-
-L(simple_done):
- movl SAVE_EBX, %ebx
- ret
-
-
-L(unroll):
- movl %esi, SAVE_ESI
- leal (%eax,%ecx,4), %ebx
- leal (%edx,%ecx,4), %esi
-
- andl %esi, %ebx
- movl SAVE_ESI, %esi
- subl $4, %ecx C size-4
-
- testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
- jz L(aligned)
-
- C both src and dst unaligned, process one limb to align them
- movl 12(%eax,%ecx,4), %ebx
- movl %ebx, 12(%edx,%ecx,4)
- decl %ecx
-L(aligned):
-
-
- ALIGN(16)
-L(top):
- C eax src
- C ebx
- C ecx counter, limbs
- C edx dst
-
- movq 8(%eax,%ecx,4), %mm0
- movq (%eax,%ecx,4), %mm1
- subl $4, %ecx
- movq %mm0, 16+8(%edx,%ecx,4)
- movq %mm1, 16(%edx,%ecx,4)
- jns L(top)
-
-
- C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
-
- testb $2, %cl
- jz L(finish_not_two)
-
- movq 8(%eax,%ecx,4), %mm0
- movq %mm0, 8(%edx,%ecx,4)
-L(finish_not_two):
-
- testb $1, %cl
- jz L(done)
-
- movl (%eax), %ebx
- movl %ebx, (%edx)
-
-L(done):
- movl SAVE_EBX, %ebx
- emms
- ret
-
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
deleted file mode 100644
index b234a1628c..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/copyi.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
-dnl
-dnl alignment dst/src, A=0mod8 N=4mod8
-dnl A/A A/N N/A N/N
-dnl K7 0.75 1.0 1.0 0.75
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Copy src,size to dst,size.
-C
-C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
-C 1.33 c/l.
-C
-C The K7 can do two loads, or two stores, or a load and a store, in one
-C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
-C however nothing under 0.7 c/l is known.
-C
-C If both source and destination are unaligned then one limb is processed at
-C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
-C used unaligned it would be 1.5 c/l.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl parameter space reused
-define(SAVE_EBX,`PARAM_SIZE')
-
-dnl minimum 5 since the unrolled code can't handle less than 5
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_copyi)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl %ebx, SAVE_EBX
-
- movl PARAM_SRC, %eax
- movl PARAM_DST, %edx
-
- cmpl $UNROLL_THRESHOLD, %ecx
- jae L(unroll)
-
- orl %ecx, %ecx
- jz L(simple_done)
-
-L(simple):
- C eax src, incrementing
- C ebx scratch
- C ecx counter
- C edx dst, incrementing
- C
- C this loop is 2 cycles/limb
-
- movl (%eax), %ebx
- movl %ebx, (%edx)
- decl %ecx
- leal 4(%eax), %eax
- leal 4(%edx), %edx
- jnz L(simple)
-
-L(simple_done):
- movl SAVE_EBX, %ebx
- ret
-
-
-L(unroll):
- movl %eax, %ebx
- leal -12(%eax,%ecx,4), %eax C src end - 12
- subl $3, %ecx C size-3
-
- andl %edx, %ebx
- leal (%edx,%ecx,4), %edx C dst end - 12
- negl %ecx
-
- testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
- jz L(aligned)
-
- C both src and dst unaligned, process one limb to align them
- movl (%eax,%ecx,4), %ebx
- movl %ebx, (%edx,%ecx,4)
- incl %ecx
-L(aligned):
-
-
- ALIGN(16)
-L(top):
- C eax src end - 12
- C ebx
- C ecx counter, negative, limbs
- C edx dst end - 12
-
- movq (%eax,%ecx,4), %mm0
- movq 8(%eax,%ecx,4), %mm1
- addl $4, %ecx
- movq %mm0, -16(%edx,%ecx,4)
- movq %mm1, -16+8(%edx,%ecx,4)
- ja L(top) C jump no carry and not zero
-
-
- C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
-
- testb $2, %cl
- jnz L(finish_not_two)
-
- movq (%eax,%ecx,4), %mm0
- movq %mm0, (%edx,%ecx,4)
-L(finish_not_two):
-
- testb $1, %cl
- jnz L(done)
-
- movl 8(%eax), %ebx
- movl %ebx, 8(%edx)
-
-L(done):
- movl SAVE_EBX, %ebx
- emms
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
deleted file mode 100644
index 483ad6a9a1..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
+++ /dev/null
@@ -1,718 +0,0 @@
-dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
-dnl
-dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor, mp_limb_t carry);
-C
-C The method and nomenclature follow part 8 of "Division by Invariant
-C Integers using Multiplication" by Granlund and Montgomery, reference in
-C gmp.texi.
-C
-C The "and"s shown in the paper are done here with "cmov"s. "m" is written
-C for m', and "d" for d_norm, which won't cause any confusion since it's
-C only the normalized divisor that's of any use in the code. "b" is written
-C for 2^N, the size of a limb, N being 32 here.
-C
-C mpn_divrem_1 avoids one division if the src high limb is less than the
-C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
-C circumstances that will be a very rare event.
-C
-C There's a small bias towards expecting xsize==0, by having code for
-C xsize==0 in a straight line and xsize!=0 under forward jumps.
-
-
-dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
-dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
-dnl
-dnl The inverse takes about 50 cycles to calculate, but after that the
-dnl multiply is 17 c/l versus division at 42 c/l.
-dnl
-dnl At 3 limbs the mul is a touch faster than div on the integer part, and
-dnl even more so on the fractional part.
-
-deflit(MUL_THRESHOLD, 3)
-
-
-defframe(PARAM_CARRY, 24)
-defframe(PARAM_DIVISOR,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC, 12)
-defframe(PARAM_XSIZE, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-
-defframe(VAR_NORM, -20)
-defframe(VAR_INVERSE, -24)
-defframe(VAR_SRC, -28)
-defframe(VAR_DST, -32)
-defframe(VAR_DST_STOP,-36)
-
-deflit(STACK_SPACE, 36)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_divrem_1c)
-deflit(`FRAME',0)
- movl PARAM_CARRY, %edx
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- leal -4(%edi,%ebx,4), %edi
- jmp LF(mpn_divrem_1,start_1c)
-
-EPILOGUE()
-
-
- C offset 0x31, close enough to aligned
-PROLOGUE(mpn_divrem_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl $0, %edx C initial carry (if can't skip a div)
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- orl %ecx, %ecx
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
- leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
-
- jz L(no_skip_div)
- movl -4(%esi,%ecx,4), %eax C src high limb
-
- cmpl %ebp, %eax C one less div if high<divisor
- jnb L(no_skip_div)
-
- movl $0, (%edi,%ecx,4) C dst high limb
- decl %ecx C size-1
- movl %eax, %edx C src high limb as initial carry
-L(no_skip_div):
-
-
-L(start_1c):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- leal (%ebx,%ecx), %eax C size+xsize
- cmpl $MUL_THRESHOLD, %eax
- jae L(mul_by_inverse)
-
-
-C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
-C It'd be possible to write them out without the looping, but no speedup
-C would be expected.
-C
-C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
-C integer part, but curiously not on the fractional part, where %ebp is a
-C (fixed) couple of cycles faster.
-
- orl %ecx, %ecx
- jz L(divide_no_integer)
-
-L(divide_integer):
- C eax scratch (quotient)
- C ebx xsize
- C ecx counter
- C edx scratch (remainder)
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- movl -4(%esi,%ecx,4), %eax
-
- divl PARAM_DIVISOR
-
- movl %eax, (%edi,%ecx,4)
- decl %ecx
- jnz L(divide_integer)
-
-
-L(divide_no_integer):
- movl PARAM_DST, %edi
- orl %ebx, %ebx
- jnz L(divide_fraction)
-
-L(divide_done):
- movl SAVE_ESI, %esi
- movl SAVE_EDI, %edi
- movl %edx, %eax
-
- movl SAVE_EBX, %ebx
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- ret
-
-
-L(divide_fraction):
- C eax scratch (quotient)
- C ebx counter
- C ecx
- C edx scratch (remainder)
- C esi
- C edi dst
- C ebp divisor
-
- movl $0, %eax
-
- divl %ebp
-
- movl %eax, -4(%edi,%ebx,4)
- decl %ebx
- jnz L(divide_fraction)
-
- jmp L(divide_done)
-
-
-
-C -----------------------------------------------------------------------------
-
-L(mul_by_inverse):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- bsrl %ebp, %eax C 31-l
-
- leal 12(%edi), %ebx
- leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
-
- movl %edi, VAR_DST
- movl %ebx, VAR_DST_STOP
-
- movl %ecx, %ebx C size
- movl $31, %ecx
-
- movl %edx, %edi C carry
- movl $-1, %edx
-
- C
-
- xorl %eax, %ecx C l
- incl %eax C 32-l
-
- shll %cl, %ebp C d normalized
- movl %ecx, VAR_NORM
-
- movd %eax, %mm7
-
- movl $-1, %eax
- subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
-
- divl %ebp C floor (b*(b-d)-1) / d
-
- orl %ebx, %ebx C size
- movl %eax, VAR_INVERSE
- leal -12(%esi,%ebx,4), %eax C &src[size-3]
-
- jz L(start_zero)
- movl %eax, VAR_SRC
- cmpl $1, %ebx
-
- movl 8(%eax), %esi C src high limb
- jz L(start_one)
-
-L(start_two_or_more):
- movl 4(%eax), %edx C src second highest limb
-
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shldl( %cl, %edx, %esi) C n10 = high,second << l
-
- cmpl $2, %ebx
- je L(integer_two_left)
- jmp L(integer_top)
-
-
-L(start_one):
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shll %cl, %esi C n10 = high << l
- movl %eax, VAR_SRC
- jmp L(integer_one_left)
-
-
-L(start_zero):
- shll %cl, %edi C n2 = carry << l
- movl $0, %esi C n10 = 0
-
- C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
- C must have xsize!=0
- jmp L(fraction_some)
-
-
-
-C -----------------------------------------------------------------------------
-C
-C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
-C execution. The instruction scheduling is important, with various
-C apparently equivalent forms running 1 to 5 cycles slower.
-C
-C A lower bound for the time would seem to be 16 cycles, based on the
-C following successive dependencies.
-C
-C cycles
-C n2+n1 1
-C mul 6
-C q1+1 1
-C mul 6
-C sub 1
-C addback 1
-C ---
-C 16
-C
-C This chain is what the loop has already, but 16 cycles isn't achieved.
-C K7 has enough decode, and probably enough execute (depending maybe on what
-C a mul actually consumes), but nothing running under 17 has been found.
-C
-C In theory n2+n1 could be done in the sub and addback stages (by
-C calculating both n2 and n2+n1 there), but lack of registers makes this an
-C unlikely proposition.
-C
-C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
-C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
-C chain, and nothing better than 18 cycles has been found when using it.
-C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
-C be an extremely rare event.
-C
-C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
-C if some special data is coming out with this always, the q1_ff special
-C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
-C induce the q1_ff case, for speed measurements or testing. Note that
-C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
-C
-C The instruction groupings and empty comments show the cycles for a naive
-C in-order view of the code (conveniently ignoring the load latency on
-C VAR_INVERSE). This shows some of where the time is going, but is nonsense
-C to the extent that out-of-order execution rearranges it. In this case
-C there's 19 cycles shown, but it executes at 17.
-
- ALIGN(16)
-L(integer_top):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx scratch (src, dst)
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 scratch (src qword)
- C mm7 rshift for normalization
-
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
- movl VAR_SRC, %ecx
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movq (%ecx), %mm0 C next limb and the one below it
- subl $4, %ecx
-
- movl %ecx, VAR_SRC
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- C
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
- jz L(q1_ff)
- movl VAR_DST, %ecx
-
- mull %ebx C (q1+1)*d
-
- psrlq %mm7, %mm0
-
- leal -4(%ecx), %ecx
-
- C
-
- subl %eax, %esi
- movl VAR_DST_STOP, %eax
-
- C
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- movd %mm0, %esi
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- sbbl $0, %ebx C q
- cmpl %eax, %ecx
-
- movl %ebx, (%ecx)
- movl %ecx, VAR_DST
- jne L(integer_top)
-
-
-L(integer_loop_done):
-
-
-C -----------------------------------------------------------------------------
-C
-C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
-C q1_ff special case. This make the code a bit smaller and simpler, and
-C costs only 1 cycle (each).
-
-L(integer_two_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx scratch (src, dst)
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
- movl PARAM_SRC, %ecx
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movd (%ecx), %mm0 C src low limb
-
- movl VAR_DST_STOP, %ecx
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
-
- mull %ebx C (q1+1)*d
-
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
-
- C
-
- subl %eax, %esi
-
- C
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- movd %mm0, %esi
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- sbbl $0, %ebx C q
-
- movl %ebx, -4(%ecx)
-
-
-C -----------------------------------------------------------------------------
-L(integer_one_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx dst
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
- movl VAR_DST_STOP, %ecx
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- C
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- C
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx C q1 if q1+1 overflowed
-
- mull %ebx
-
- C
-
- C
-
- C
-
- subl %eax, %esi
-
- C
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- sbbl $0, %ebx C q
-
- movl %ebx, -8(%ecx)
- subl $8, %ecx
-
-
-
-L(integer_none):
- cmpl $0, PARAM_XSIZE
- jne L(fraction_some)
-
- movl %edi, %eax
-L(fraction_done):
- movl VAR_NORM, %ecx
- movl SAVE_EBP, %ebp
-
- movl SAVE_EDI, %edi
- movl SAVE_ESI, %esi
-
- movl SAVE_EBX, %ebx
- addl $STACK_SPACE, %esp
-
- shrl %cl, %eax
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C
-C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
-C of q*d is simply -d and the remainder n-q*d = n10+d
-
-L(q1_ff):
- C eax (divisor)
- C ebx (q1+1 == 0)
- C ecx
- C edx
- C esi n10
- C edi n2
- C ebp divisor
-
- movl VAR_DST, %ecx
- movl VAR_DST_STOP, %edx
- subl $4, %ecx
-
- psrlq %mm7, %mm0
- leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
- movl %ecx, VAR_DST
-
- movd %mm0, %esi C next n10
-
- movl $-1, (%ecx)
- cmpl %ecx, %edx
- jne L(integer_top)
-
- jmp L(integer_loop_done)
-
-
-
-C -----------------------------------------------------------------------------
-C
-C Being the fractional part, the "source" limbs are all zero, meaning
-C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
-C
-C The loop runs at 15 cycles. The dependent chain is the same as the
-C general case above, but without the n2+n1 stage (due to n1==0), so 15
-C would seem to be the lower bound.
-C
-C A not entirely obvious simplification is that q1+1 never overflows a limb,
-C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
-C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
-C rnd() means rounding down to a multiple of d.
-C
-C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
-C = m*d + b*d - m - b
-C = floor((b(b-d)-1)/d)*d + b*d - m - b
-C = rnd(b(b-d)-1) + b*d - m - b
-C = rnd(b(b-d)-1 + b*d) - m - b
-C = rnd(b*b-1) - m - b
-C <= (b-2)*b
-C
-C Unchanged from the general case is that the final quotient limb q can be
-C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
-C equation 8.4 of the paper which simplifies as follows when n1==0 and
-C n0==0.
-C
-C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
-C
-C As before, the instruction groupings and empty comments show a naive
-C in-order view of the code, which is made a nonsense by out of order
-C execution. There's 17 cycles shown, but it executes at 15.
-C
-C Rotating the store q and remainder->n2 instructions up to the top of the
-C loop gets the run time down from 16 to 15.
-
- ALIGN(16)
-L(fraction_some):
- C eax
- C ebx
- C ecx
- C edx
- C esi
- C edi carry
- C ebp divisor
-
- movl PARAM_DST, %esi
- movl VAR_DST_STOP, %ecx
- movl %edi, %eax
-
- subl $8, %ecx
-
- jmp L(fraction_entry)
-
-
- ALIGN(16)
-L(fraction_top):
- C eax n2 carry, then scratch
- C ebx scratch (nadj, q1)
- C ecx dst, decrementing
- C edx scratch
- C esi dst stop point
- C edi (will be n2)
- C ebp divisor
-
- movl %ebx, (%ecx) C previous q
- movl %eax, %edi C remainder->n2
-
-L(fraction_entry):
- mull VAR_INVERSE C m*n2
-
- movl %ebp, %eax C d
- subl $4, %ecx C dst
- leal 1(%edi), %ebx
-
- C
-
- C
-
- C
-
- C
-
- addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
-
- mull %ebx C (q1+1)*d
-
- C
-
- C
-
- C
-
- negl %eax C low of n - (q1+1)*d
-
- C
-
- sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
- leal (%ebp,%eax), %edx
-
- cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
- sbbl $0, %ebx C q
- cmpl %esi, %ecx
-
- jne L(fraction_top)
-
-
- movl %ebx, (%ecx)
- jmp L(fraction_done)
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
deleted file mode 100644
index 4d17c881ec..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/lshift.asm
+++ /dev/null
@@ -1,472 +0,0 @@
-dnl AMD K7 mpn_lshift -- mpn left shift.
-dnl
-dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 4 1.51
-dnl 8 1.26
-dnl 16 1.21
-dnl 32 1.2
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C Shift src,size left by shift many bits and store the result in dst,size.
-C Zeros are shifted in at the right. The bits shifted out at the left are
-C the return value.
-C
-C The comments in mpn_rshift apply here too.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 10)
-',`
-deflit(UNROLL_THRESHOLD, 10)
-')
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EDI, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EBX, -12)
-deflit(SAVE_SIZE, 12)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_lshift)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %eax
- movl PARAM_SRC, %edx
- subl $SAVE_SIZE, %esp
-deflit(`FRAME',SAVE_SIZE)
-
- movl PARAM_SHIFT, %ecx
- movl %edi, SAVE_EDI
-
- movl PARAM_DST, %edi
- decl %eax
- jnz L(more_than_one_limb)
-
- movl (%edx), %edx
-
- shldl( %cl, %edx, %eax) C eax was decremented to zero
-
- shll %cl, %edx
-
- movl %edx, (%edi)
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(more_than_one_limb):
- C eax size-1
- C ebx
- C ecx shift
- C edx src
- C esi
- C edi dst
- C ebp
-
- movd PARAM_SHIFT, %mm6
- movd (%edx,%eax,4), %mm5 C src high limb
- cmp $UNROLL_THRESHOLD-1, %eax
-
- jae L(unroll)
- negl %ecx
- movd (%edx), %mm4 C src low limb
-
- addl $32, %ecx
-
- movd %ecx, %mm7
-
-L(simple_top):
- C eax loop counter, limbs
- C ebx
- C ecx
- C edx src
- C esi
- C edi dst
- C ebp
- C
- C mm0 scratch
- C mm4 src low limb
- C mm5 src high limb
- C mm6 shift
- C mm7 32-shift
-
- movq -4(%edx,%eax,4), %mm0
- decl %eax
-
- psrlq %mm7, %mm0
-
- movd %mm0, 4(%edi,%eax,4)
- jnz L(simple_top)
-
-
- psllq %mm6, %mm5
- psllq %mm6, %mm4
-
- psrlq $32, %mm5
- movd %mm4, (%edi) C dst low limb
-
- movd %mm5, %eax C return value
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax size-1
- C ebx (saved)
- C ecx shift
- C edx src
- C esi
- C edi dst
- C ebp
- C
- C mm5 src high limb, for return value
- C mm6 lshift
-
- movl %esi, SAVE_ESI
- movl %ebx, SAVE_EBX
- leal -4(%edx,%eax,4), %edx C &src[size-2]
-
- testb $4, %dl
- movq (%edx), %mm1 C src high qword
-
- jz L(start_src_aligned)
-
-
- C src isn't aligned, process high limb (marked xxx) separately to
- C make it so
- C
- C source -4(edx,%eax,4)
- C |
- C +-------+-------+-------+--
- C | xxx |
- C +-------+-------+-------+--
- C 0mod8 4mod8 0mod8
- C
- C dest -4(edi,%eax,4)
- C |
- C +-------+-------+--
- C | xxx | |
- C +-------+-------+--
-
- psllq %mm6, %mm1
- subl $4, %edx
- movl %eax, PARAM_SIZE C size-1
-
- psrlq $32, %mm1
- decl %eax C size-2 is new size-1
-
- movd %mm1, 4(%edi,%eax,4)
- movq (%edx), %mm1 C new src high qword
-L(start_src_aligned):
-
-
- leal -4(%edi,%eax,4), %edi C &dst[size-2]
- psllq %mm6, %mm5
-
- testl $4, %edi
- psrlq $32, %mm5 C return value
-
- jz L(start_dst_aligned)
-
-
- C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
- C shift is 32 bits extra. High limb of dst (marked xxx) handled
- C here separately.
- C
- C source %edx
- C +-------+-------+--
- C | mm1 |
- C +-------+-------+--
- C 0mod8 4mod8
- C
- C dest %edi
- C +-------+-------+-------+--
- C | xxx |
- C +-------+-------+-------+--
- C 0mod8 4mod8 0mod8
-
- movq %mm1, %mm0
- psllq %mm6, %mm1
- addl $32, %ecx C shift+32
-
- psrlq $32, %mm1
-
- movd %mm1, 4(%edi)
- movq %mm0, %mm1
- subl $4, %edi
-
- movd %ecx, %mm6 C new lshift
-L(start_dst_aligned):
-
- decl %eax C size-2, two last limbs handled at end
- movq %mm1, %mm2 C copy of src high qword
- negl %ecx
-
- andl $-2, %eax C round size down to even
- addl $64, %ecx
-
- movl %eax, %ebx
- negl %eax
-
- andl $UNROLL_MASK, %eax
- decl %ebx
-
- shll %eax
-
- movd %ecx, %mm7 C rshift = 64-lshift
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%eax,%eax,4), %esi
-')
- shrl $UNROLL_LOG2, %ebx C loop counter
-
- leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
- movl PARAM_SIZE, %eax C for use at end
- jmp *%esi
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%eax,%eax,4), %esi
- addl $L(entry)-L(here), %esi
- addl (%esp), %esi
-
- ret
-')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(32)
-L(top):
- C eax size (for use at end)
- C ebx loop counter
- C ecx rshift
- C edx src
- C esi computed jump
- C edi dst
- C ebp
- C
- C mm0 scratch
- C mm1 \ carry (alternating, mm2 first)
- C mm2 /
- C mm6 lshift
- C mm7 rshift
- C
- C 10 code bytes/limb
- C
- C The two chunks differ in whether mm1 or mm2 hold the carry.
- C The computed jump puts the initial carry in both mm1 and mm2.
-
-L(entry):
-deflit(CHUNK_COUNT, 4)
-forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 - 8))
-
- movq disp0(%edx), %mm0
- psllq %mm6, %mm2
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- por %mm2, %mm0
- movq %mm0, disp0(%edi)
-
-
- movq disp1(%edx), %mm0
- psllq %mm6, %mm1
-
- movq %mm0, %mm2
- psrlq %mm7, %mm0
-
- por %mm1, %mm0
- movq %mm0, disp1(%edi)
-')
-
- subl $UNROLL_BYTES, %edx
- subl $UNROLL_BYTES, %edi
- decl %ebx
-
- jns L(top)
-
-
-
-define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
-
-L(end):
- testb $1, %al
- movl SAVE_EBX, %ebx
- psllq %mm6, %mm2 C wanted left shifted in all cases below
-
- movd %mm5, %eax
-
- movl SAVE_ESI, %esi
- jz L(end_even)
-
-
-L(end_odd):
-
- C Size odd, destination was aligned.
- C
- C source edx+8 edx+4
- C --+---------------+-------+
- C | mm2 | |
- C --+---------------+-------+
- C
- C dest edi
- C --+---------------+---------------+-------+
- C | written | | |
- C --+---------------+---------------+-------+
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C Size odd, destination was unaligned.
- C
- C source edx+8 edx+4
- C --+---------------+-------+
- C | mm2 | |
- C --+---------------+-------+
- C
- C dest edi
- C --+---------------+---------------+
- C | written | |
- C --+---------------+---------------+
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C In both cases there's one extra limb of src to fetch and combine
- C with mm2 to make a qword at (%edi), and in the aligned case
- C there's an extra limb of dst to be formed from that extra src limb
- C left shifted.
-
- movd disp(4) (%edx), %mm0
- testb $32, %cl
-
- movq %mm0, %mm1
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
- psllq %mm6, %mm1
-
- por %mm2, %mm0
-
- movq %mm0, disp(0) (%edi)
- jz L(end_odd_unaligned)
- movd %mm1, disp(-4) (%edi)
-L(end_odd_unaligned):
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-
-L(end_even):
-
- C Size even, destination was aligned.
- C
- C source edx+8
- C --+---------------+
- C | mm2 |
- C --+---------------+
- C
- C dest edi
- C --+---------------+---------------+
- C | written | |
- C --+---------------+---------------+
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C Size even, destination was unaligned.
- C
- C source edx+8
- C --+---------------+
- C | mm2 |
- C --+---------------+
- C
- C dest edi+4
- C --+---------------+-------+
- C | written | |
- C --+---------------+-------+
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C The movq for the aligned case overwrites the movd for the
- C unaligned case.
-
- movq %mm2, %mm0
- psrlq $32, %mm2
-
- testb $32, %cl
- movd %mm2, disp(4) (%edi)
-
- jz L(end_even_unaligned)
- movq %mm0, disp(0) (%edi)
-L(end_even_unaligned):
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
deleted file mode 100644
index 545ca56ddf..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
+++ /dev/null
@@ -1,457 +0,0 @@
-dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
-dnl
-dnl K7: 17.0 cycles/limb.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
-C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t carry);
-C
-C The code here is the same as mpn_divrem_1, but with the quotient
-C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments.
-
-
-dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
-dnl used, rather than plain "divl"s. Minimum value 2.
-dnl
-dnl The inverse takes about 50 cycles to calculate, but after that the
-dnl multiply is 17 c/l versus division at 41 c/l.
-dnl
-dnl Using mul or div is about the same speed at 3 limbs, so the threshold
-dnl is set to 4 to get the smaller div code used at 3.
-
-deflit(MUL_THRESHOLD, 4)
-
-
-defframe(PARAM_CARRY, 16)
-defframe(PARAM_DIVISOR,12)
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-
-defframe(VAR_NORM, -20)
-defframe(VAR_INVERSE, -24)
-defframe(VAR_SRC_STOP,-28)
-
-deflit(STACK_SPACE, 28)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_mod_1c)
-deflit(`FRAME',0)
- movl PARAM_CARRY, %edx
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- jmp LF(mpn_mod_1,start_1c)
-
-EPILOGUE()
-
-
- ALIGN(32)
-PROLOGUE(mpn_mod_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl $0, %edx C initial carry (if can't skip a div)
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- orl %ecx, %ecx
- jz L(divide_done)
-
- movl -4(%esi,%ecx,4), %eax C src high limb
-
- cmpl %ebp, %eax C carry flag if high<divisor
-
- cmovc( %eax, %edx) C src high limb as initial carry
- sbbl $0, %ecx C size-1 to skip one div
- jz L(divide_done)
-
-
- ALIGN(16)
-L(start_1c):
- C eax
- C ebx
- C ecx size
- C edx carry
- C esi src
- C edi
- C ebp divisor
-
- cmpl $MUL_THRESHOLD, %ecx
- jae L(mul_by_inverse)
-
-
-
-C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
-C but it's already fast and compact, and there's nothing to gain by
-C expanding it out.
-C
-C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
-
- orl %ecx, %ecx
- jz L(divide_done)
-
-
-L(divide_top):
- C eax scratch (quotient)
- C ebx
- C ecx counter, limbs, decrementing
- C edx scratch (remainder)
- C esi src
- C edi
- C ebp
-
- movl -4(%esi,%ecx,4), %eax
-
- divl PARAM_DIVISOR
-
- decl %ecx
- jnz L(divide_top)
-
-
-L(divide_done):
- movl SAVE_ESI, %esi
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- movl %edx, %eax
-
- ret
-
-
-
-C -----------------------------------------------------------------------------
-
-L(mul_by_inverse):
- C eax
- C ebx
- C ecx size
- C edx carry
- C esi src
- C edi
- C ebp divisor
-
- bsrl %ebp, %eax C 31-l
-
- movl %ebx, SAVE_EBX
- leal -4(%esi), %ebx
-
- movl %ebx, VAR_SRC_STOP
- movl %edi, SAVE_EDI
-
- movl %ecx, %ebx C size
- movl $31, %ecx
-
- movl %edx, %edi C carry
- movl $-1, %edx
-
- C
-
- xorl %eax, %ecx C l
- incl %eax C 32-l
-
- shll %cl, %ebp C d normalized
- movl %ecx, VAR_NORM
-
- movd %eax, %mm7
-
- movl $-1, %eax
- subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
-
- divl %ebp C floor (b*(b-d)-1) / d
-
- C
-
- movl %eax, VAR_INVERSE
- leal -12(%esi,%ebx,4), %eax C &src[size-3]
-
- movl 8(%eax), %esi C src high limb
- movl 4(%eax), %edx C src second highest limb
-
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shldl( %cl, %edx, %esi) C n10 = high,second << l
-
- movl %eax, %ecx C &src[size-3]
-
-
-ifelse(MUL_THRESHOLD,2,`
- cmpl $2, %ebx
- je L(inverse_two_left)
-')
-
-
-C The dependent chain here is the same as in mpn_divrem_1, but a few
-C instructions are saved by not needing to store the quotient limbs.
-C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
-C
-C There's four dummy instructions in the loop, all of which are necessary
-C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
-C or changed from load to store or vice versa. They're not completely
-C random, since they correspond to what mpn_divrem_1 has, but there's no
-C obvious reason why they're necessary. Presumably they induce something
-C good in the out of order execution, perhaps through some load/store
-C ordering and/or decoding effects.
-C
-C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
-C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
-C about 13.5 c/l.
-
- ALIGN(32)
-L(inverse_top):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx src pointer, decrementing
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 scratch (src qword)
- C mm7 rshift for normalization
-
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
- movl PARAM_SIZE, %ebx C dummy
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movq (%ecx), %mm0 C next src limb and the one below it
- subl $4, %ecx
-
- movl %ecx, PARAM_SIZE C dummy
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- C
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
- jz L(q1_ff)
- nop C dummy
-
- mull %ebx C (q1+1)*d
-
- psrlq %mm7, %mm0
- leal 0(%ecx), %ecx C dummy
-
- C
-
- C
-
- subl %eax, %esi
- movl VAR_SRC_STOP, %eax
-
- C
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- movd %mm0, %esi
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- cmpl %eax, %ecx
- jne L(inverse_top)
-
-
-L(inverse_loop_done):
-
-
-C -----------------------------------------------------------------------------
-
-L(inverse_two_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx &src[-1]
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 scratch (src dword)
- C mm7 rshift
-
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movd 4(%ecx), %mm0 C src low limb
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
-
- mull %ebx C (q1+1)*d
-
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
-
- C
-
- subl %eax, %esi
-
- C
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- movd %mm0, %esi
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
-
-
-C One limb left
-
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
- cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
- movl %edi, %eax C n2
-
- leal (%ebp,%esi), %ebx
- cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
- sbbl $-1, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movl VAR_NORM, %ecx C for final denorm
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- C
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
-
- mull %ebx C (q1+1)*d
-
- movl SAVE_EBX, %ebx
-
- C
-
- C
-
- subl %eax, %esi
-
- movl %esi, %eax C remainder
- movl SAVE_ESI, %esi
-
- sbbl %edx, %edi C n - (q1+1)*d
- leal (%ebp,%eax), %edx
- movl SAVE_EBP, %ebp
-
- cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
- movl SAVE_EDI, %edi
-
- shrl %cl, %eax C denorm remainder
- addl $STACK_SPACE, %esp
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C
-C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
-C of q*d is simply -d and the remainder n-q*d = n10+d
-
-L(q1_ff):
- C eax (divisor)
- C ebx (q1+1 == 0)
- C ecx src pointer
- C edx
- C esi n10
- C edi (n2)
- C ebp divisor
-
- movl VAR_SRC_STOP, %edx
- leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
- psrlq %mm7, %mm0
-
- movd %mm0, %esi C next n10
-
- cmpl %ecx, %edx
- jne L(inverse_top)
- jmp L(inverse_loop_done)
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm
deleted file mode 100644
index fa7c8c04a5..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/popham.asm
+++ /dev/null
@@ -1,239 +0,0 @@
-dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
-dnl distance.
-dnl
-dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
-dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
-
-define(psadbw_mm4_mm0,
-`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
- `HAVE_TARGET_CPU_pentium3'),1,
- `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
-
-`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
-') C this works enough for the sum of bytes done below, making it
- C possible to test on an older cpu
- leal -8(%esp), %esp
- movq %mm4, (%esp)
- movq %mm0, %mm4
-forloop(i,1,7,
-` psrlq $ 8, %mm4
- paddb %mm4, %mm0
-')
- pushl $ 0
- pushl $ 0xFF
- pand (%esp), %mm0
- movq 8(%esp), %mm4
- leal 16(%esp), %esp
-')')
-
-
-C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
-C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
-C
-C The code here is almost certainly not optimal, but is already a 3x speedup
-C over the generic C code. The main improvement would be to interleave
-C processing of two qwords in the loop so as to fully exploit the available
-C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
-C
-C The loop is based on the example "Efficient 64-bit population count using
-C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
-C page 158 of rev E (reference in mpn/x86/k7/README).
-
-ifdef(`OPERATION_popcount',,
-`ifdef(`OPERATION_hamdist',,
-`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
-')')')
-
-define(HAM,
-m4_assert_numargs(1)
-`ifdef(`OPERATION_hamdist',`$1')')
-
-define(POP,
-m4_assert_numargs(1)
-`ifdef(`OPERATION_popcount',`$1')')
-
-HAM(`
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC2, 8)
-defframe(PARAM_SRC, 4)
-define(M4_function,mpn_hamdist)
-')
-POP(`
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-define(M4_function,mpn_popcount)
-')
-
-MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-
-
-ifdef(`PIC',,`
- dnl non-PIC
-
- DATA
- ALIGN(8)
-
-define(LS,
-m4_assert_numargs(1)
-`LF(M4_function,`$1')')
-
-LS(rodata_AAAAAAAAAAAAAAAA):
- .long 0xAAAAAAAA
- .long 0xAAAAAAAA
-
-LS(rodata_3333333333333333):
- .long 0x33333333
- .long 0x33333333
-
-LS(rodata_0F0F0F0F0F0F0F0F):
- .long 0x0F0F0F0F
- .long 0x0F0F0F0F
-')
-
- .text
- ALIGN(32)
-
-PROLOGUE(M4_function)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- orl %ecx, %ecx
- jz L(zero)
-
-ifdef(`PIC',`
- movl $0xAAAAAAAA, %eax
- movl $0x33333333, %edx
-
- movd %eax, %mm7
- movd %edx, %mm6
-
- movl $0x0F0F0F0F, %eax
-
- punpckldq %mm7, %mm7
- punpckldq %mm6, %mm6
-
- movd %eax, %mm5
- movd %edx, %mm4
-
- punpckldq %mm5, %mm5
-
-',`
- movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
- movq LS(rodata_3333333333333333), %mm6
- movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
-')
- pxor %mm4, %mm4
-
-define(REG_AAAAAAAAAAAAAAAA,%mm7)
-define(REG_3333333333333333,%mm6)
-define(REG_0F0F0F0F0F0F0F0F,%mm5)
-define(REG_0000000000000000,%mm4)
-
-
- movl PARAM_SRC, %eax
-HAM(` movl PARAM_SRC2, %edx')
-
- pxor %mm2, %mm2 C total
-
- shrl %ecx
- jnc L(top)
-
- movd (%eax,%ecx,8), %mm1
-
-HAM(` movd 0(%edx,%ecx,8), %mm0
- pxor %mm0, %mm1
-')
- orl %ecx, %ecx
- jmp L(loaded)
-
-
- ALIGN(16)
-L(top):
- C eax src
- C ebx
- C ecx counter, qwords, decrementing
- C edx [hamdist] src2
- C
- C mm0 (scratch)
- C mm1 (scratch)
- C mm2 total (low dword)
- C mm3
- C mm4 \
- C mm5 | special constants
- C mm6 |
- C mm7 /
-
- movq -8(%eax,%ecx,8), %mm1
-
-HAM(` pxor -8(%edx,%ecx,8), %mm1')
- decl %ecx
-
-L(loaded):
- movq %mm1, %mm0
- pand REG_AAAAAAAAAAAAAAAA, %mm1
-
- psrlq $1, %mm1
-
- psubd %mm1, %mm0 C bit pairs
-
-
- movq %mm0, %mm1
- psrlq $2, %mm0
-
- pand REG_3333333333333333, %mm0
- pand REG_3333333333333333, %mm1
-
- paddd %mm1, %mm0 C nibbles
-
-
- movq %mm0, %mm1
- psrlq $4, %mm0
-
- pand REG_0F0F0F0F0F0F0F0F, %mm0
- pand REG_0F0F0F0F0F0F0F0F, %mm1
-
- paddd %mm1, %mm0 C bytes
-
-
- psadbw_mm4_mm0
-
- paddd %mm0, %mm2 C add to total
- jnz L(top)
-
-
- movd %mm2, %eax
- emms
- ret
-
-
-L(zero):
- movl $0, %eax
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
deleted file mode 100644
index abb546cd5b..0000000000
--- a/rts/gmp/mpn/x86/k7/mmx/rshift.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-dnl AMD K7 mpn_rshift -- mpn right shift.
-dnl
-dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 4 1.51
-dnl 8 1.26
-dnl 16 1.21
-dnl 32 1.2
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C Shift src,size right by shift many bits and store the result in dst,size.
-C Zeros are shifted in at the left. The bits shifted out at the right are
-C the return value.
-C
-C This code uses 64-bit MMX operations, which makes it possible to handle
-C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
-C code, on the other hand, suffers from shrd being a vector path decode and
-C running at 3 cycles back-to-back.
-C
-C Full speed depends on source and destination being aligned, and some hairy
-C setups and finish-ups are done to arrange this for the loop.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 10)
-',`
-deflit(UNROLL_THRESHOLD, 10)
-')
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EDI, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EBX, -12)
-deflit(SAVE_SIZE, 12)
-
- .text
- ALIGN(32)
-
-PROLOGUE(mpn_rshift)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %eax
- movl PARAM_SRC, %edx
- subl $SAVE_SIZE, %esp
-deflit(`FRAME',SAVE_SIZE)
-
- movl PARAM_SHIFT, %ecx
- movl %edi, SAVE_EDI
-
- movl PARAM_DST, %edi
- decl %eax
- jnz L(more_than_one_limb)
-
- movl (%edx), %edx C src limb
-
- shrdl( %cl, %edx, %eax) C eax was decremented to zero
-
- shrl %cl, %edx
-
- movl %edx, (%edi) C dst limb
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(more_than_one_limb):
- C eax size-1
- C ebx
- C ecx shift
- C edx src
- C esi
- C edi dst
- C ebp
-
- movd PARAM_SHIFT, %mm6 C rshift
- movd (%edx), %mm5 C src low limb
- cmp $UNROLL_THRESHOLD-1, %eax
-
- jae L(unroll)
- leal (%edx,%eax,4), %edx C &src[size-1]
- leal -4(%edi,%eax,4), %edi C &dst[size-2]
-
- movd (%edx), %mm4 C src high limb
- negl %eax
-
-
-L(simple_top):
- C eax loop counter, limbs, negative
- C ebx
- C ecx shift
- C edx carry
- C edx &src[size-1]
- C edi &dst[size-2]
- C ebp
- C
- C mm0 scratch
- C mm4 src high limb
- C mm5 src low limb
- C mm6 shift
-
- movq (%edx,%eax,4), %mm0
- incl %eax
-
- psrlq %mm6, %mm0
-
- movd %mm0, (%edi,%eax,4)
- jnz L(simple_top)
-
-
- psllq $32, %mm5
- psrlq %mm6, %mm4
-
- psrlq %mm6, %mm5
- movd %mm4, 4(%edi) C dst high limb
-
- movd %mm5, %eax C return value
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax size-1
- C ebx
- C ecx shift
- C edx src
- C esi
- C edi dst
- C ebp
- C
- C mm5 src low limb
- C mm6 rshift
-
- testb $4, %dl
- movl %esi, SAVE_ESI
- movl %ebx, SAVE_EBX
-
- psllq $32, %mm5
- jz L(start_src_aligned)
-
-
- C src isn't aligned, process low limb separately (marked xxx) and
- C step src and dst by one limb, making src aligned.
- C
- C source edx
- C --+-------+-------+-------+
- C | xxx |
- C --+-------+-------+-------+
- C 4mod8 0mod8 4mod8
- C
- C dest edi
- C --+-------+-------+
- C | | xxx |
- C --+-------+-------+
-
- movq (%edx), %mm0 C src low two limbs
- addl $4, %edx
- movl %eax, PARAM_SIZE C size-1
-
- addl $4, %edi
- decl %eax C size-2 is new size-1
-
- psrlq %mm6, %mm0
- movl %edi, PARAM_DST C new dst
-
- movd %mm0, -4(%edi)
-L(start_src_aligned):
-
-
- movq (%edx), %mm1 C src low two limbs
- decl %eax C size-2, two last limbs handled at end
- testl $4, %edi
-
- psrlq %mm6, %mm5
- jz L(start_dst_aligned)
-
-
- C dst isn't aligned, add 4 to make it so, and pretend the shift is
- C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
- C
- C source edx
- C --+-------+-------+
- C | mm1 |
- C --+-------+-------+
- C 4mod8 0mod8
- C
- C dest edi
- C --+-------+-------+-------+
- C | xxx |
- C --+-------+-------+-------+
- C 4mod8 0mod8 4mod8
-
- movq %mm1, %mm0
- psrlq %mm6, %mm1
- addl $32, %ecx C shift+32
-
- movd %mm1, (%edi)
- movq %mm0, %mm1
- addl $4, %edi C new dst
-
- movd %ecx, %mm6
-L(start_dst_aligned):
-
-
- movq %mm1, %mm2 C copy of src low two limbs
- negl %ecx
- andl $-2, %eax C round size down to even
-
- movl %eax, %ebx
- negl %eax
- addl $64, %ecx
-
- andl $UNROLL_MASK, %eax
- decl %ebx
-
- shll %eax
-
- movd %ecx, %mm7 C lshift = 64-rshift
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%eax,%eax,4), %esi
- negl %eax
-')
- shrl $UNROLL_LOG2, %ebx C loop counter
-
- leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
- movl PARAM_SIZE, %eax C for use at end
-
- jmp *%esi
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%eax,%eax,4), %esi
- addl $L(entry)-L(here), %esi
- addl (%esp), %esi
- negl %eax
-
- ret
-')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(64)
-L(top):
- C eax size, for use at end
- C ebx loop counter
- C ecx lshift
- C edx src
- C esi was computed jump
- C edi dst
- C ebp
- C
- C mm0 scratch
- C mm1 \ carry (alternating)
- C mm2 /
- C mm6 rshift
- C mm7 lshift
- C
- C 10 code bytes/limb
- C
- C The two chunks differ in whether mm1 or mm2 hold the carry.
- C The computed jump puts the initial carry in both mm1 and mm2.
-
-L(entry):
-deflit(CHUNK_COUNT, 4)
-forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 + 8))
-
- movq disp0(%edx), %mm0
- psrlq %mm6, %mm2
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- por %mm2, %mm0
- movq %mm0, disp0(%edi)
-
-
- movq disp1(%edx), %mm0
- psrlq %mm6, %mm1
-
- movq %mm0, %mm2
- psllq %mm7, %mm0
-
- por %mm1, %mm0
- movq %mm0, disp1(%edi)
-')
-
- addl $UNROLL_BYTES, %edx
- addl $UNROLL_BYTES, %edi
- decl %ebx
-
- jns L(top)
-
-
-deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
-deflit(`disp1', eval(disp0-0 + 8))
-
- testb $1, %al
- psrlq %mm6, %mm2 C wanted rshifted in all cases below
- movl SAVE_ESI, %esi
-
- movd %mm5, %eax C return value
-
- movl SAVE_EBX, %ebx
- jz L(end_even)
-
-
- C Size odd, destination was aligned.
- C
- C source
- C edx
- C +-------+---------------+--
- C | | mm2 |
- C +-------+---------------+--
- C
- C dest edi
- C +-------+---------------+---------------+--
- C | | | written |
- C +-------+---------------+---------------+--
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C Size odd, destination was unaligned.
- C
- C source
- C edx
- C +-------+---------------+--
- C | | mm2 |
- C +-------+---------------+--
- C
- C dest edi
- C +---------------+---------------+--
- C | | written |
- C +---------------+---------------+--
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C In both cases there's one extra limb of src to fetch and combine
- C with mm2 to make a qword to store, and in the aligned case there's
- C a further extra limb of dst to be formed.
-
-
- movd disp0(%edx), %mm0
- movq %mm0, %mm1
-
- psllq %mm7, %mm0
- testb $32, %cl
-
- por %mm2, %mm0
- psrlq %mm6, %mm1
-
- movq %mm0, disp0(%edi)
- jz L(finish_odd_unaligned)
-
- movd %mm1, disp1(%edi)
-L(finish_odd_unaligned):
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-
-L(end_even):
-
- C Size even, destination was aligned.
- C
- C source
- C +---------------+--
- C | mm2 |
- C +---------------+--
- C
- C dest edi
- C +---------------+---------------+--
- C | | mm3 |
- C +---------------+---------------+--
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C Size even, destination was unaligned.
- C
- C source
- C +---------------+--
- C | mm2 |
- C +---------------+--
- C
- C dest edi
- C +-------+---------------+--
- C | | mm3 |
- C +-------+---------------+--
- C
- C mm6 = shift+32
- C mm7 = 64-(shift+32)
-
-
- C The movd for the unaligned case is the same data as the movq for
- C the aligned case, it's just a choice between whether one or two
- C limbs should be written.
-
-
- testb $32, %cl
- movd %mm2, disp0(%edi)
-
- jz L(end_even_unaligned)
-
- movq %mm2, disp0(%edi)
-L(end_even_unaligned):
-
- movl SAVE_EDI, %edi
- addl $SAVE_SIZE, %esp
- emms
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm
deleted file mode 100644
index 07f7085b10..0000000000
--- a/rts/gmp/mpn/x86/k7/mul_1.asm
+++ /dev/null
@@ -1,265 +0,0 @@
-dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
-dnl
-dnl K7: 3.4 cycles/limb (at 16 limbs/loop).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 8 3.9
-dnl 16 3.4
-dnl 32 3.4
-dnl 64 3.35
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier);
-C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier, mp_limb_t carry);
-C
-C Multiply src,size by mult and store the result in dst,size.
-C Return the carry limb from the top of the result.
-C
-C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
-C the low limb of the destination.
-C
-C Variations on the unrolled loop have been tried, with the current
-C registers or with the counter on the stack to free up ecx. The current
-C code is the fastest found.
-C
-C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
-C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
-C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
-C without having to change the computed jump. There's obviously something
-C fishy going on, perhaps with what execution units the mul needs.
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBP, -4)
-defframe(SAVE_EDI, -8)
-defframe(SAVE_ESI, -12)
-defframe(SAVE_EBX, -16)
-deflit(STACK_SPACE, 16)
-
-dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 7)
-',`
-deflit(UNROLL_THRESHOLD, 5)
-')
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_mul_1c)
-deflit(`FRAME',0)
- movl PARAM_CARRY, %edx
- jmp LF(mpn_mul_1,start_nc)
-EPILOGUE()
-
-
-PROLOGUE(mpn_mul_1)
-deflit(`FRAME',0)
- xorl %edx, %edx C initial carry
-L(start_nc):
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME', STACK_SPACE)
-
- movl %edi, SAVE_EDI
- movl %ebx, SAVE_EBX
- movl %edx, %ebx
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- cmpl $UNROLL_THRESHOLD, %ecx
-
- movl PARAM_DST, %edi
- movl %ebp, SAVE_EBP
- jae L(unroll)
-
- leal (%esi,%ecx,4), %esi
- leal (%edi,%ecx,4), %edi
- negl %ecx
-
- movl PARAM_MULTIPLIER, %ebp
-
-L(simple):
- C eax scratch
- C ebx carry
- C ecx counter (negative)
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
- movl (%esi,%ecx,4), %eax
-
- mull %ebp
-
- addl %ebx, %eax
- movl %eax, (%edi,%ecx,4)
- movl $0, %ebx
-
- adcl %edx, %ebx
- incl %ecx
- jnz L(simple)
-
- movl %ebx, %eax
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
-
- movl SAVE_EDI, %edi
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C The mov to load the next source limb is done well ahead of the mul, this
-C is necessary for full speed. It leads to one limb handled separately
-C after the loop.
-C
-C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
-C to avoid having an 0x80 displacement in the code for the last limb in the
-C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
-
-ifelse(eval(UNROLL_COUNT >= 32),1,`
-deflit(SRC_OFFSET,4)
-',`
-deflit(SRC_OFFSET,)
-')
-
- C this is offset 0x62, so close enough to aligned
-L(unroll):
- C eax
- C ebx initial carry
- C ecx size
- C edx
- C esi src
- C edi dst
- C ebp
-deflit(`FRAME', STACK_SPACE)
-
- leal -1(%ecx), %edx C one limb handled at end
- leal -2(%ecx), %ecx C and ecx is one less than edx
- movl %ebp, SAVE_EBP
-
- negl %edx
- shrl $UNROLL_LOG2, %ecx C unrolled loop counter
- movl (%esi), %eax C src low limb
-
- andl $UNROLL_MASK, %edx
- movl PARAM_DST, %edi
-
- movl %edx, %ebp
- shll $4, %edx
-
- C 17 code bytes per limb
-ifdef(`PIC',`
- call L(add_eip_to_edx)
-L(here):
-',`
- leal L(entry) (%edx,%ebp), %edx
-')
- negl %ebp
-
- leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
- movl PARAM_MULTIPLIER, %ebp
-
- jmp *%edx
-
-
-ifdef(`PIC',`
-L(add_eip_to_edx):
- C See README.family about old gas bugs
- leal (%edx,%ebp), %edx
- addl $L(entry)-L(here), %edx
- addl (%esp), %edx
- ret
-')
-
-
-C ----------------------------------------------------------------------------
- ALIGN(32)
-L(top):
- C eax next src limb
- C ebx carry
- C ecx counter
- C edx scratch
- C esi src+4
- C edi dst
- C ebp multiplier
- C
- C 17 code bytes per limb processed
-
-L(entry):
-forloop(i, 0, UNROLL_COUNT-1, `
- deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
-
- mull %ebp
-
- addl %eax, %ebx
-Zdisp( movl, disp_src,(%esi), %eax)
-Zdisp( movl, %ebx, disp_dst,(%edi))
-
- movl $0, %ebx
- adcl %edx, %ebx
-')
-
- decl %ecx
-
- leal UNROLL_BYTES(%esi), %esi
- leal UNROLL_BYTES(%edi), %edi
- jns L(top)
-
-
-deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
-
- mull %ebp
-
- addl %eax, %ebx
- movl $0, %eax
- movl SAVE_ESI, %esi
-
- movl %ebx, disp0(%edi)
- movl SAVE_EBX, %ebx
- movl SAVE_EDI, %edi
-
- adcl %edx, %eax
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm
deleted file mode 100644
index c4be62e633..0000000000
--- a/rts/gmp/mpn/x86/k7/mul_basecase.asm
+++ /dev/null
@@ -1,593 +0,0 @@
-dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
-dnl
-dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
-dnl limbs/loop unrolling).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
-dnl 8 4.67
-dnl 16 4.59
-dnl 32 4.42
-dnl Maximum possible with the current code is 32.
-dnl
-dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
-dnl done with a straight run through a block of code, no inner loop. Using
-dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
-
-deflit(UNROLL_COUNT, 32)
-
-
-C void mpn_mul_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xsize,
-C mp_srcptr yp, mp_size_t ysize);
-C
-C Calculate xp,xsize multiplied by yp,ysize, storing the result in
-C wp,xsize+ysize.
-C
-C This routine is essentially the same as mpn/generic/mul_basecase.c, but
-C it's faster because it does most of the mpn_addmul_1() startup
-C calculations only once. The saving is 15-25% on typical sizes coming from
-C the Karatsuba multiply code.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 5)
-',`
-deflit(UNROLL_THRESHOLD, 5)
-')
-
-defframe(PARAM_YSIZE,20)
-defframe(PARAM_YP, 16)
-defframe(PARAM_XSIZE,12)
-defframe(PARAM_XP, 8)
-defframe(PARAM_WP, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_mul_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_XSIZE, %ecx
- movl PARAM_YP, %eax
-
- movl PARAM_XP, %edx
- movl (%eax), %eax C yp low limb
-
- cmpl $2, %ecx
- ja L(xsize_more_than_two)
- je L(two_by_something)
-
-
- C one limb by one limb
-
- mull (%edx)
-
- movl PARAM_WP, %ecx
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
- ret
-
-
-C -----------------------------------------------------------------------------
-L(two_by_something):
-deflit(`FRAME',0)
- decl PARAM_YSIZE
- pushl %ebx defframe_pushl(`SAVE_EBX')
- movl %eax, %ecx C yp low limb
-
- movl PARAM_WP, %ebx
- pushl %esi defframe_pushl(`SAVE_ESI')
- movl %edx, %esi C xp
-
- movl (%edx), %eax C xp low limb
- jnz L(two_by_two)
-
-
- C two limbs by one limb
-
- mull %ecx
-
- movl %eax, (%ebx)
- movl 4(%esi), %eax
- movl %edx, %esi C carry
-
- mull %ecx
-
- addl %eax, %esi
-
- movl %esi, 4(%ebx)
- movl SAVE_ESI, %esi
-
- adcl $0, %edx
-
- movl %edx, 8(%ebx)
- movl SAVE_EBX, %ebx
- addl $FRAME, %esp
-
- ret
-
-
-
-C -----------------------------------------------------------------------------
-C Could load yp earlier into another register.
-
- ALIGN(16)
-L(two_by_two):
- C eax xp low limb
- C ebx wp
- C ecx yp low limb
- C edx
- C esi xp
- C edi
- C ebp
-
-dnl FRAME carries on from previous
-
- mull %ecx C xp[0] * yp[0]
-
- push %edi defframe_pushl(`SAVE_EDI')
- movl %edx, %edi C carry, for wp[1]
-
- movl %eax, (%ebx)
- movl 4(%esi), %eax
-
- mull %ecx C xp[1] * yp[0]
-
- addl %eax, %edi
- movl PARAM_YP, %ecx
-
- adcl $0, %edx
- movl 4(%ecx), %ecx C yp[1]
- movl %edi, 4(%ebx)
-
- movl 4(%esi), %eax C xp[1]
- movl %edx, %edi C carry, for wp[2]
-
- mull %ecx C xp[1] * yp[1]
-
- addl %eax, %edi
-
- adcl $0, %edx
- movl (%esi), %eax C xp[0]
-
- movl %edx, %esi C carry, for wp[3]
-
- mull %ecx C xp[0] * yp[1]
-
- addl %eax, 4(%ebx)
- adcl %edx, %edi
- movl %edi, 8(%ebx)
-
- adcl $0, %esi
- movl SAVE_EDI, %edi
- movl %esi, 12(%ebx)
-
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
- addl $FRAME, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(xsize_more_than_two):
-
-C The first limb of yp is processed with a simple mpn_mul_1 style loop
-C inline. Unrolling this doesn't seem worthwhile since it's only run once
-C (whereas the addmul below is run ysize-1 many times). A call to the
-C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
-C popping, and doesn't seem likely to be worthwhile on the typical 13-26
-C limb operations the Karatsuba code calls here with.
-
- C eax yp[0]
- C ebx
- C ecx xsize
- C edx xp
- C esi
- C edi
- C ebp
-
-dnl FRAME doesn't carry on from previous, no pushes yet here
-defframe(`SAVE_EBX',-4)
-defframe(`SAVE_ESI',-8)
-defframe(`SAVE_EDI',-12)
-defframe(`SAVE_EBP',-16)
-deflit(`FRAME',0)
-
- subl $16, %esp
-deflit(`FRAME',16)
-
- movl %edi, SAVE_EDI
- movl PARAM_WP, %edi
-
- movl %ebx, SAVE_EBX
- movl %ebp, SAVE_EBP
- movl %eax, %ebp
-
- movl %esi, SAVE_ESI
- xorl %ebx, %ebx
- leal (%edx,%ecx,4), %esi C xp end
-
- leal (%edi,%ecx,4), %edi C wp end of mul1
- negl %ecx
-
-
-L(mul1):
- C eax scratch
- C ebx carry
- C ecx counter, negative
- C edx scratch
- C esi xp end
- C edi wp end of mul1
- C ebp multiplier
-
- movl (%esi,%ecx,4), %eax
-
- mull %ebp
-
- addl %ebx, %eax
- movl %eax, (%edi,%ecx,4)
- movl $0, %ebx
-
- adcl %edx, %ebx
- incl %ecx
- jnz L(mul1)
-
-
- movl PARAM_YSIZE, %edx
- movl PARAM_XSIZE, %ecx
-
- movl %ebx, (%edi) C final carry
- decl %edx
-
- jnz L(ysize_more_than_one)
-
-
- movl SAVE_EDI, %edi
- movl SAVE_EBX, %ebx
-
- movl SAVE_EBP, %ebp
- movl SAVE_ESI, %esi
- addl $FRAME, %esp
-
- ret
-
-
-L(ysize_more_than_one):
- cmpl $UNROLL_THRESHOLD, %ecx
- movl PARAM_YP, %eax
-
- jae L(unroll)
-
-
-C -----------------------------------------------------------------------------
- C simple addmul looping
- C
- C eax yp
- C ebx
- C ecx xsize
- C edx ysize-1
- C esi xp end
- C edi wp end of mul1
- C ebp
-
- leal 4(%eax,%edx,4), %ebp C yp end
- negl %ecx
- negl %edx
-
- movl (%esi,%ecx,4), %eax C xp low limb
- movl %edx, PARAM_YSIZE C -(ysize-1)
- incl %ecx
-
- xorl %ebx, %ebx C initial carry
- movl %ecx, PARAM_XSIZE C -(xsize-1)
- movl %ebp, PARAM_YP
-
- movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
- jmp L(simple_outer_entry)
-
-
- C this is offset 0x121 so close enough to aligned
-L(simple_outer_top):
- C ebp ysize counter, negative
-
- movl PARAM_YP, %edx
- movl PARAM_XSIZE, %ecx C -(xsize-1)
- xorl %ebx, %ebx C carry
-
- movl %ebp, PARAM_YSIZE
- addl $4, %edi C next position in wp
-
- movl (%edx,%ebp,4), %ebp C yp limb - multiplier
- movl -4(%esi,%ecx,4), %eax C xp low limb
-
-
-L(simple_outer_entry):
-
-L(simple_inner):
- C eax xp limb
- C ebx carry limb
- C ecx loop counter (negative)
- C edx scratch
- C esi xp end
- C edi wp end
- C ebp multiplier
-
- mull %ebp
-
- addl %eax, %ebx
- adcl $0, %edx
-
- addl %ebx, (%edi,%ecx,4)
- movl (%esi,%ecx,4), %eax
- adcl $0, %edx
-
- incl %ecx
- movl %edx, %ebx
- jnz L(simple_inner)
-
-
- mull %ebp
-
- movl PARAM_YSIZE, %ebp
- addl %eax, %ebx
-
- adcl $0, %edx
- addl %ebx, (%edi)
-
- adcl $0, %edx
- incl %ebp
-
- movl %edx, 4(%edi)
- jnz L(simple_outer_top)
-
-
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
-
- movl SAVE_EDI, %edi
- movl SAVE_EBP, %ebp
- addl $FRAME, %esp
-
- ret
-
-
-
-C -----------------------------------------------------------------------------
-C
-C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
-C comments.
-C
-C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
-C increment xp and wp. This is used to adjust back xp and wp, and rshifted
-C to given an initial VAR_COUNTER at the top of the outer loop.
-C
-C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
-C up to -1, inclusive.
-C
-C VAR_JMP is the computed jump into the unrolled loop.
-C
-C VAR_XP_LOW is the least significant limb of xp, which is needed at the
-C start of the unrolled loop.
-C
-C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
-C inclusive.
-C
-C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
-C added to give the location of the next limb of yp, which is the multiplier
-C in the unrolled loop.
-C
-C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
-C outer loop to take care of xp, wp and the inner loop counter.
-
-defframe(VAR_COUNTER, -20)
-defframe(VAR_ADJUST, -24)
-defframe(VAR_JMP, -28)
-defframe(VAR_XP_LOW, -32)
-deflit(VAR_EXTRA_SPACE, 16)
-
-
-L(unroll):
- C eax yp
- C ebx
- C ecx xsize
- C edx ysize-1
- C esi xp end
- C edi wp end of mul1
- C ebp
-
- movl PARAM_XP, %esi
- movl 4(%eax), %ebp C multiplier (yp second limb)
- leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
-
- movl PARAM_WP, %edi
- movl %eax, PARAM_YP
- negl %edx
-
- movl %edx, PARAM_YSIZE
- leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
- decl %ecx C xsize-1
-
- movl (%esi), %eax C xp low limb
- andl $-UNROLL_MASK-1, %ebx
- negl %ecx
-
- subl $VAR_EXTRA_SPACE, %esp
-deflit(`FRAME',16+VAR_EXTRA_SPACE)
- negl %ebx
- andl $UNROLL_MASK, %ecx
-
- movl %ebx, VAR_ADJUST
- movl %ecx, %edx
- shll $4, %ecx
-
- sarl $UNROLL_LOG2, %ebx
-
- C 17 code bytes per limb
-ifdef(`PIC',`
- call L(pic_calc)
-L(unroll_here):
-',`
- leal L(unroll_entry) (%ecx,%edx,1), %ecx
-')
- negl %edx
-
- movl %eax, VAR_XP_LOW
- movl %ecx, VAR_JMP
- leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
- leal 4(%esi,%edx,4), %esi C and start at second limb
- jmp L(unroll_outer_entry)
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See README.family about old gas bugs
- leal (%ecx,%edx,1), %ecx
- addl $L(unroll_entry)-L(unroll_here), %ecx
- addl (%esp), %ecx
- ret
-')
-
-
-C --------------------------------------------------------------------------
- ALIGN(32)
-L(unroll_outer_top):
- C ebp ysize counter, negative
-
- movl VAR_ADJUST, %ebx
- movl PARAM_YP, %edx
-
- movl VAR_XP_LOW, %eax
- movl %ebp, PARAM_YSIZE C store incremented ysize counter
-
- leal 4(%edi,%ebx,4), %edi
- leal (%esi,%ebx,4), %esi
- sarl $UNROLL_LOG2, %ebx
-
- movl (%edx,%ebp,4), %ebp C yp next multiplier
- movl VAR_JMP, %ecx
-
-L(unroll_outer_entry):
- mull %ebp
-
- testb $1, %cl C and clear carry bit
- movl %ebx, VAR_COUNTER
- movl $0, %ebx
-
- movl $0, %ecx
- cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
- cmovnz( %eax, %ebx)
-
- C Extra fetch of VAR_JMP is bad, but registers are tight
- jmp *VAR_JMP
-
-
-C -----------------------------------------------------------------------------
- ALIGN(32)
-L(unroll_top):
- C eax xp limb
- C ebx carry high
- C ecx carry low
- C edx scratch
- C esi xp+8
- C edi wp
- C ebp yp multiplier limb
- C
- C VAR_COUNTER loop counter, negative
- C
- C 17 bytes each limb
-
-L(unroll_entry):
-
-deflit(CHUNK_COUNT,2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 + 4))
-
-Zdisp( movl, disp0,(%esi), %eax)
- adcl %edx, %ebx
-
- mull %ebp
-
-Zdisp( addl, %ecx, disp0,(%edi))
- movl $0, %ecx
-
- adcl %eax, %ebx
-
-
- movl disp1(%esi), %eax
- adcl %edx, %ecx
-
- mull %ebp
-
- addl %ebx, disp1(%edi)
- movl $0, %ebx
-
- adcl %eax, %ecx
-')
-
-
- incl VAR_COUNTER
- leal UNROLL_BYTES(%esi), %esi
- leal UNROLL_BYTES(%edi), %edi
-
- jnz L(unroll_top)
-
-
- C eax
- C ebx zero
- C ecx low
- C edx high
- C esi
- C edi wp, pointing at second last limb)
- C ebp
- C
- C carry flag to be added to high
-
-deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
-deflit(`disp1', eval(disp0-0 + 4))
-
- movl PARAM_YSIZE, %ebp
- adcl $0, %edx
- addl %ecx, disp0(%edi)
-
- adcl $0, %edx
- incl %ebp
-
- movl %edx, disp1(%edi)
- jnz L(unroll_outer_top)
-
-
- movl SAVE_ESI, %esi
- movl SAVE_EBP, %ebp
-
- movl SAVE_EDI, %edi
- movl SAVE_EBX, %ebx
- addl $FRAME, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
deleted file mode 100644
index 84861ea66b..0000000000
--- a/rts/gmp/mpn/x86/k7/sqr_basecase.asm
+++ /dev/null
@@ -1,627 +0,0 @@
-dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
-dnl
-dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
-dnl (measured on the speed difference between 25 and 50 limbs, which is
-dnl roughly the Karatsuba recursing range).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
-dnl some comments.
-
-deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
-
-ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
-`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
-
-m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
-deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
-
-
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
-C which is quite a bit, but is considered good value since squares big
-C enough to use most of the code will be spending quite a few cycles in it.
-
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl PARAM_SRC, %eax
- cmpl $2, %ecx
-
- movl PARAM_DST, %edx
- je L(two_limbs)
- ja L(three_or_more)
-
-
-C------------------------------------------------------------------------------
-C one limb only
- C eax src
- C ecx size
- C edx dst
-
- movl (%eax), %eax
- movl %edx, %ecx
-
- mull %eax
-
- movl %edx, 4(%ecx)
- movl %eax, (%ecx)
- ret
-
-
-C------------------------------------------------------------------------------
-C
-C Using the read/modify/write "add"s seems to be faster than saving and
-C restoring registers. Perhaps the loads for the first set hide under the
-C mul latency and the second gets store to load forwarding.
-
- ALIGN(16)
-L(two_limbs):
- C eax src
- C ebx
- C ecx size
- C edx dst
-deflit(`FRAME',0)
-
- pushl %ebx FRAME_pushl()
- movl %eax, %ebx C src
- movl (%eax), %eax
-
- movl %edx, %ecx C dst
-
- mull %eax C src[0]^2
-
- movl %eax, (%ecx) C dst[0]
- movl 4(%ebx), %eax
-
- movl %edx, 4(%ecx) C dst[1]
-
- mull %eax C src[1]^2
-
- movl %eax, 8(%ecx) C dst[2]
- movl (%ebx), %eax
-
- movl %edx, 12(%ecx) C dst[3]
-
- mull 4(%ebx) C src[0]*src[1]
-
- popl %ebx
-
- addl %eax, 4(%ecx)
- adcl %edx, 8(%ecx)
- adcl $0, 12(%ecx)
- ASSERT(nc)
-
- addl %eax, 4(%ecx)
- adcl %edx, 8(%ecx)
- adcl $0, 12(%ecx)
- ASSERT(nc)
-
- ret
-
-
-C------------------------------------------------------------------------------
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(STACK_SPACE, 16)
-
-L(three_or_more):
- subl $STACK_SPACE, %esp
- cmpl $4, %ecx
- jae L(four_or_more)
-deflit(`FRAME',STACK_SPACE)
-
-
-C------------------------------------------------------------------------------
-C Three limbs
-C
-C Writing out the loads and stores separately at the end of this code comes
-C out about 10 cycles faster than using adcls to memory.
-
- C eax src
- C ecx size
- C edx dst
-
- movl %ebx, SAVE_EBX
- movl %eax, %ebx C src
- movl (%eax), %eax
-
- movl %edx, %ecx C dst
- movl %esi, SAVE_ESI
- movl %edi, SAVE_EDI
-
- mull %eax C src[0] ^ 2
-
- movl %eax, (%ecx)
- movl 4(%ebx), %eax
- movl %edx, 4(%ecx)
-
- mull %eax C src[1] ^ 2
-
- movl %eax, 8(%ecx)
- movl 8(%ebx), %eax
- movl %edx, 12(%ecx)
-
- mull %eax C src[2] ^ 2
-
- movl %eax, 16(%ecx)
- movl (%ebx), %eax
- movl %edx, 20(%ecx)
-
- mull 4(%ebx) C src[0] * src[1]
-
- movl %eax, %esi
- movl (%ebx), %eax
- movl %edx, %edi
-
- mull 8(%ebx) C src[0] * src[2]
-
- addl %eax, %edi
- movl %ebp, SAVE_EBP
- movl $0, %ebp
-
- movl 4(%ebx), %eax
- adcl %edx, %ebp
-
- mull 8(%ebx) C src[1] * src[2]
-
- xorl %ebx, %ebx
- addl %eax, %ebp
-
- adcl $0, %edx
-
- C eax
- C ebx zero, will be dst[5]
- C ecx dst
- C edx dst[4]
- C esi dst[1]
- C edi dst[2]
- C ebp dst[3]
-
- adcl $0, %edx
- addl %esi, %esi
-
- adcl %edi, %edi
- movl 4(%ecx), %eax
-
- adcl %ebp, %ebp
-
- adcl %edx, %edx
-
- adcl $0, %ebx
- addl %eax, %esi
- movl 8(%ecx), %eax
-
- adcl %eax, %edi
- movl 12(%ecx), %eax
- movl %esi, 4(%ecx)
-
- adcl %eax, %ebp
- movl 16(%ecx), %eax
- movl %edi, 8(%ecx)
-
- movl SAVE_ESI, %esi
- movl SAVE_EDI, %edi
-
- adcl %eax, %edx
- movl 20(%ecx), %eax
- movl %ebp, 12(%ecx)
-
- adcl %ebx, %eax
- ASSERT(nc)
- movl SAVE_EBX, %ebx
- movl SAVE_EBP, %ebp
-
- movl %edx, 16(%ecx)
- movl %eax, 20(%ecx)
- addl $FRAME, %esp
-
- ret
-
-
-C------------------------------------------------------------------------------
-L(four_or_more):
-
-C First multiply src[0]*src[1..size-1] and store at dst[1..size].
-C Further products are added in rather than stored.
-
- C eax src
- C ebx
- C ecx size
- C edx dst
- C esi
- C edi
- C ebp
-
-defframe(`VAR_COUNTER',-20)
-defframe(`VAR_JMP', -24)
-deflit(EXTRA_STACK_SPACE, 8)
-
- movl %ebx, SAVE_EBX
- movl %edi, SAVE_EDI
- leal (%edx,%ecx,4), %edi C &dst[size]
-
- movl %esi, SAVE_ESI
- movl %ebp, SAVE_EBP
- leal (%eax,%ecx,4), %esi C &src[size]
-
- movl (%eax), %ebp C multiplier
- movl $0, %ebx
- decl %ecx
-
- negl %ecx
- subl $EXTRA_STACK_SPACE, %esp
-FRAME_subl_esp(EXTRA_STACK_SPACE)
-
-L(mul_1):
- C eax scratch
- C ebx carry
- C ecx counter
- C edx scratch
- C esi &src[size]
- C edi &dst[size]
- C ebp multiplier
-
- movl (%esi,%ecx,4), %eax
-
- mull %ebp
-
- addl %ebx, %eax
- movl %eax, (%edi,%ecx,4)
- movl $0, %ebx
-
- adcl %edx, %ebx
- incl %ecx
- jnz L(mul_1)
-
-
-C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
-C
-C The last two products, which are the bottom right corner of the product
-C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
-C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
-C cases that need to be done.
-C
-C The unrolled code is the same as in mpn_addmul_1, see that routine for
-C some comments.
-C
-C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
-C
-C VAR_JMP is the computed jump into the unrolled code, stepped by one code
-C chunk each outer loop.
-C
-C K7 does branch prediction on indirect jumps, which is bad since it's a
-C different target each time. There seems no way to avoid this.
-
-dnl This value also hard coded in some shifts and adds
-deflit(CODE_BYTES_PER_LIMB, 17)
-
-dnl With the unmodified &src[size] and &dst[size] pointers, the
-dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
-dnl values up to 31, but above that an offset must be added to them.
-
-deflit(OFFSET,
-ifelse(eval(UNROLL_COUNT>31),1,
-eval((UNROLL_COUNT-31)*4),
-0))
-
-dnl Because the last chunk of code is generated differently, a label placed
-dnl at the end doesn't work. Instead calculate the implied end using the
-dnl start and how many chunks of code there are.
-
-deflit(UNROLL_INNER_END,
-`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
-
- C eax
- C ebx carry
- C ecx
- C edx
- C esi &src[size]
- C edi &dst[size]
- C ebp
-
- movl PARAM_SIZE, %ecx
- movl %ebx, (%edi)
-
- subl $4, %ecx
- jz L(corner)
-
- negl %ecx
-ifelse(OFFSET,0,,`subl $OFFSET, %edi')
-ifelse(OFFSET,0,,`subl $OFFSET, %esi')
-
- movl %ecx, %edx
- shll $4, %ecx
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
-')
-
-
- C The calculated jump mustn't come out to before the start of the
- C code available. This is the limit UNROLL_COUNT puts on the src
- C operand size, but checked here directly using the jump address.
- ASSERT(ae,
- `movl_text_address(L(unroll_inner_start), %eax)
- cmpl %eax, %ecx')
-
-
-C------------------------------------------------------------------------------
- ALIGN(16)
-L(unroll_outer_top):
- C eax
- C ebx high limb to store
- C ecx VAR_JMP
- C edx VAR_COUNTER, limbs, negative
- C esi &src[size], constant
- C edi dst ptr, high of last addmul
- C ebp
-
- movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
- movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
-
- movl %edx, VAR_COUNTER
-
- mull %ebp
-
-define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
-
- testb $1, %cl
- movl %edx, %ebx C high carry
- movl %ecx, %edx C jump
-
- movl %eax, %ecx C low carry
- cmovX( %ebx, %ecx) C high carry reverse
- cmovX( %eax, %ebx) C low carry reverse
-
- leal CODE_BYTES_PER_LIMB(%edx), %eax
- xorl %edx, %edx
- leal 4(%edi), %edi
-
- movl %eax, VAR_JMP
-
- jmp *%eax
-
-
-ifdef(`PIC',`
-L(pic_calc):
- addl (%esp), %ecx
- addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
- addl %edx, %ecx
- ret
-')
-
-
- C Must be an even address to preserve the significance of the low
- C bit of the jump address indicating which way around ecx/ebx should
- C start.
- ALIGN(2)
-
-L(unroll_inner_start):
- C eax next limb
- C ebx carry high
- C ecx carry low
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
-forloop(`i', UNROLL_COUNT, 1, `
- deflit(`disp_src', eval(-i*4 + OFFSET))
- deflit(`disp_dst', eval(disp_src - 4))
-
- m4_assert(`disp_src>=-128 && disp_src<128')
- m4_assert(`disp_dst>=-128 && disp_dst<128')
-
-ifelse(eval(i%2),0,`
-Zdisp( movl, disp_src,(%esi), %eax)
- adcl %edx, %ebx
-
- mull %ebp
-
-Zdisp( addl, %ecx, disp_dst,(%edi))
- movl $0, %ecx
-
- adcl %eax, %ebx
-
-',`
- dnl this bit comes out last
-Zdisp( movl, disp_src,(%esi), %eax)
- adcl %edx, %ecx
-
- mull %ebp
-
-dnl Zdisp( addl %ebx, disp_src,(%edi))
- addl %ebx, disp_dst(%edi)
-ifelse(forloop_last,0,
-` movl $0, %ebx')
-
- adcl %eax, %ecx
-')
-')
-
- C eax next limb
- C ebx carry high
- C ecx carry low
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
- adcl $0, %edx
- addl %ecx, -4+OFFSET(%edi)
- movl VAR_JMP, %ecx
-
- adcl $0, %edx
-
- movl %edx, m4_empty_if_zero(OFFSET) (%edi)
- movl VAR_COUNTER, %edx
-
- incl %edx
- jnz L(unroll_outer_top)
-
-
-ifelse(OFFSET,0,,`
- addl $OFFSET, %esi
- addl $OFFSET, %edi
-')
-
-
-C------------------------------------------------------------------------------
-L(corner):
- C esi &src[size]
- C edi &dst[2*size-5]
-
- movl -12(%esi), %ebp
- movl -8(%esi), %eax
- movl %eax, %ecx
-
- mull %ebp
-
- addl %eax, -4(%edi)
- movl -4(%esi), %eax
-
- adcl $0, %edx
- movl %edx, %ebx
- movl %eax, %esi
-
- mull %ebp
-
- addl %ebx, %eax
-
- adcl $0, %edx
- addl %eax, (%edi)
- movl %esi, %eax
-
- adcl $0, %edx
- movl %edx, %ebx
-
- mull %ecx
-
- addl %ebx, %eax
- movl %eax, 4(%edi)
-
- adcl $0, %edx
- movl %edx, 8(%edi)
-
-
-
-C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
-
-L(lshift_start):
- movl PARAM_SIZE, %eax
- movl PARAM_DST, %edi
- xorl %ecx, %ecx C clear carry
-
- leal (%edi,%eax,8), %edi
- notl %eax C -size-1, preserve carry
-
- leal 2(%eax), %eax C -(size-1)
-
-L(lshift):
- C eax counter, negative
- C ebx
- C ecx
- C edx
- C esi
- C edi dst, pointing just after last limb
- C ebp
-
- rcll -4(%edi,%eax,8)
- rcll (%edi,%eax,8)
- incl %eax
- jnz L(lshift)
-
- setc %al
-
- movl PARAM_SRC, %esi
- movl %eax, -4(%edi) C dst most significant limb
-
- movl PARAM_SIZE, %ecx
-
-
-C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
-C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
-C low limb of src[0]^2.
-
- movl (%esi), %eax C src[0]
-
- mull %eax
-
- leal (%esi,%ecx,4), %esi C src point just after last limb
- negl %ecx
-
- movl %eax, (%edi,%ecx,8) C dst[0]
- incl %ecx
-
-L(diag):
- C eax scratch
- C ebx scratch
- C ecx counter, negative
- C edx carry
- C esi src just after last limb
- C edi dst just after last limb
- C ebp
-
- movl (%esi,%ecx,4), %eax
- movl %edx, %ebx
-
- mull %eax
-
- addl %ebx, -4(%edi,%ecx,8)
- adcl %eax, (%edi,%ecx,8)
- adcl $0, %edx
-
- incl %ecx
- jnz L(diag)
-
-
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
-
- addl %edx, -4(%edi) C dst most significant limb
- movl SAVE_EDI, %edi
-
- movl SAVE_EBP, %ebp
- addl $FRAME, %esp
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm
deleted file mode 100644
index 4735335cbe..0000000000
--- a/rts/gmp/mpn/x86/lshift.asm
+++ /dev/null
@@ -1,90 +0,0 @@
-dnl x86 mpn_lshift -- mpn left shift.
-
-dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_lshift)
-
- pushl %edi
- pushl %esi
- pushl %ebx
-deflit(`FRAME',12)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%edx
- movl PARAM_SHIFT,%ecx
-
- subl $4,%esi C adjust src
-
- movl (%esi,%edx,4),%ebx C read most significant limb
- xorl %eax,%eax
- shldl( %cl, %ebx, %eax) C compute carry limb
- decl %edx
- jz L(end)
- pushl %eax C push carry limb onto stack
- testb $1,%dl
- jnz L(1) C enter loop in the middle
- movl %ebx,%eax
-
- ALIGN(8)
-L(oop): movl (%esi,%edx,4),%ebx C load next lower limb
- shldl( %cl, %ebx, %eax) C compute result limb
- movl %eax,(%edi,%edx,4) C store it
- decl %edx
-L(1): movl (%esi,%edx,4),%eax
- shldl( %cl, %eax, %ebx)
- movl %ebx,(%edi,%edx,4)
- decl %edx
- jnz L(oop)
-
- shll %cl,%eax C compute least significant limb
- movl %eax,(%edi) C store it
-
- popl %eax C pop carry limb
-
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-L(end): shll %cl,%ebx C compute least significant limb
- movl %ebx,(%edi) C store it
-
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm
deleted file mode 100644
index 3908161b3e..0000000000
--- a/rts/gmp/mpn/x86/mod_1.asm
+++ /dev/null
@@ -1,141 +0,0 @@
-dnl x86 mpn_mod_1 -- mpn by limb remainder.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl cycles/limb
-dnl K6 20
-dnl P5 44
-dnl P6 39
-dnl 486 approx 42 maybe
-dnl
-dnl The following have their own optimized mod_1 implementations, but for
-dnl reference the code here runs as follows.
-dnl
-dnl P6MMX 39
-dnl K7 41
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
-C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t carry);
-C
-C Divide src,size by divisor and return the remainder. The quotient is
-C discarded.
-C
-C See mpn/x86/divrem_1.asm for some comments.
-
-defframe(PARAM_CARRY, 16)
-defframe(PARAM_DIVISOR,12)
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-
- .text
- ALIGN(16)
-
-PROLOGUE(mpn_mod_1c)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- pushl %esi FRAME_pushl()
-
- movl PARAM_DIVISOR, %esi
- orl %ecx, %ecx
-
- movl PARAM_CARRY, %edx
- jnz LF(mpn_mod_1,top)
-
- popl %esi
- movl %edx, %eax
-
- popl %ebx
-
- ret
-
-EPILOGUE()
-
-
-PROLOGUE(mpn_mod_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- pushl %ebx FRAME_pushl()
-
- movl PARAM_SRC, %ebx
- pushl %esi FRAME_pushl()
-
- orl %ecx, %ecx
- jz L(done_zero)
-
- movl PARAM_DIVISOR, %esi
- movl -4(%ebx,%ecx,4), %eax C src high limb
-
- cmpl %esi, %eax
-
- sbbl %edx, %edx C -1 if high<divisor
-
- addl %edx, %ecx C skip one division if high<divisor
- jz L(done_eax)
-
- andl %eax, %edx C carry if high<divisor
-
-
-L(top):
- C eax scratch (quotient)
- C ebx src
- C ecx counter
- C edx carry (remainder)
- C esi divisor
- C edi
- C ebp
-
- movl -4(%ebx,%ecx,4), %eax
-
- divl %esi
-
- loop_or_decljnz L(top)
-
-
- movl %edx, %eax
-L(done_eax):
- popl %esi
-
- popl %ebx
-
- ret
-
-
-L(done_zero):
- popl %esi
- xorl %eax, %eax
-
- popl %ebx
-
- ret
-
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm
deleted file mode 100644
index 8817f291bc..0000000000
--- a/rts/gmp/mpn/x86/mul_1.asm
+++ /dev/null
@@ -1,130 +0,0 @@
-dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
-dnl with a limb and store the result in a second limb vector.
-dnl
-dnl cycles/limb
-dnl P6: 5.5
-dnl
-dnl The following CPUs have their own optimized code, but for reference the
-dnl code here runs as follows.
-dnl
-dnl cycles/limb
-dnl P5: 12.5
-dnl K6: 10.5
-dnl K7: 4.5
-
-
-dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software
-dnl Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier);
-
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_mul_1)
-deflit(`FRAME',0)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%ecx
-
- xorl %ebx,%ebx
- andl $3,%ecx
- jz L(end0)
-
-L(oop0):
- movl (%esi),%eax
- mull PARAM_MULTIPLIER
- leal 4(%esi),%esi
- addl %ebx,%eax
- movl $0,%ebx
- adcl %ebx,%edx
- movl %eax,(%edi)
- movl %edx,%ebx C propagate carry into cylimb
-
- leal 4(%edi),%edi
- decl %ecx
- jnz L(oop0)
-
-L(end0):
- movl PARAM_SIZE,%ecx
- shrl $2,%ecx
- jz L(end)
-
-
- ALIGN(8)
-L(oop): movl (%esi),%eax
- mull PARAM_MULTIPLIER
- addl %eax,%ebx
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 4(%esi),%eax
- mull PARAM_MULTIPLIER
- movl %ebx,(%edi)
- addl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- movl 8(%esi),%eax
- mull PARAM_MULTIPLIER
- movl %ebp,4(%edi)
- addl %eax,%ebx C new lo + cylimb
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 12(%esi),%eax
- mull PARAM_MULTIPLIER
- movl %ebx,8(%edi)
- addl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- movl %ebp,12(%edi)
-
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ecx
- jnz L(oop)
-
-L(end): movl %ebx,%eax
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm
deleted file mode 100644
index 3a9b73895b..0000000000
--- a/rts/gmp/mpn/x86/mul_basecase.asm
+++ /dev/null
@@ -1,209 +0,0 @@
-dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
-dnl in a third limb vector.
-
-
-dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_mul_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xsize,
-C mp_srcptr yp, mp_size_t ysize);
-C
-C This was written in a haste since the Pentium optimized code that was used
-C for all x86 machines was slow for the Pentium II. This code would benefit
-C from some cleanup.
-C
-C To shave off some percentage of the run-time, one should make 4 variants
-C of the Louter loop, for the four different outcomes of un mod 4. That
-C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
-C part of the function, but since it is not very large, that would be
-C acceptable.
-C
-C The mul loop (at L(oopM)) might need some tweaking. It's current speed is
-C unknown.
-
-defframe(PARAM_YSIZE,20)
-defframe(PARAM_YP, 16)
-defframe(PARAM_XSIZE,12)
-defframe(PARAM_XP, 8)
-defframe(PARAM_WP, 4)
-
-defframe(VAR_MULTIPLIER, -4)
-defframe(VAR_COUNTER, -8)
-deflit(VAR_STACK_SPACE, 8)
-
- .text
- ALIGN(8)
-
-PROLOGUE(mpn_mul_basecase)
-deflit(`FRAME',0)
-
- subl $VAR_STACK_SPACE,%esp
- pushl %esi
- pushl %ebp
- pushl %edi
-deflit(`FRAME',eval(VAR_STACK_SPACE+12))
-
- movl PARAM_XP,%esi
- movl PARAM_WP,%edi
- movl PARAM_YP,%ebp
-
- movl (%esi),%eax C load xp[0]
- mull (%ebp) C multiply by yp[0]
- movl %eax,(%edi) C store to wp[0]
- movl PARAM_XSIZE,%ecx C xsize
- decl %ecx C If xsize = 1, ysize = 1 too
- jz L(done)
-
- pushl %ebx
-FRAME_pushl()
- movl %edx,%ebx
-
- leal 4(%esi),%esi
- leal 4(%edi),%edi
-
-L(oopM):
- movl (%esi),%eax C load next limb at xp[j]
- leal 4(%esi),%esi
- mull (%ebp)
- addl %ebx,%eax
- movl %edx,%ebx
- adcl $0,%ebx
- movl %eax,(%edi)
- leal 4(%edi),%edi
- decl %ecx
- jnz L(oopM)
-
- movl %ebx,(%edi) C most significant limb of product
- addl $4,%edi C increment wp
- movl PARAM_XSIZE,%eax
- shll $2,%eax
- subl %eax,%edi
- subl %eax,%esi
-
- movl PARAM_YSIZE,%eax C ysize
- decl %eax
- jz L(skip)
- movl %eax,VAR_COUNTER C set index i to ysize
-
-L(outer):
- movl PARAM_YP,%ebp C yp
- addl $4,%ebp C make ebp point to next v limb
- movl %ebp,PARAM_YP
- movl (%ebp),%eax C copy y limb ...
- movl %eax,VAR_MULTIPLIER C ... to stack slot
- movl PARAM_XSIZE,%ecx
-
- xorl %ebx,%ebx
- andl $3,%ecx
- jz L(end0)
-
-L(oop0):
- movl (%esi),%eax
- mull VAR_MULTIPLIER
- leal 4(%esi),%esi
- addl %ebx,%eax
- movl $0,%ebx
- adcl %ebx,%edx
- addl %eax,(%edi)
- adcl %edx,%ebx C propagate carry into cylimb
-
- leal 4(%edi),%edi
- decl %ecx
- jnz L(oop0)
-
-L(end0):
- movl PARAM_XSIZE,%ecx
- shrl $2,%ecx
- jz L(endX)
-
- ALIGN(8)
-L(oopX):
- movl (%esi),%eax
- mull VAR_MULTIPLIER
- addl %eax,%ebx
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 4(%esi),%eax
- mull VAR_MULTIPLIER
- addl %ebx,(%edi)
- adcl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- movl 8(%esi),%eax
- mull VAR_MULTIPLIER
- addl %ebp,4(%edi)
- adcl %eax,%ebx C new lo + cylimb
- movl $0,%ebp
- adcl %edx,%ebp
-
- movl 12(%esi),%eax
- mull VAR_MULTIPLIER
- addl %ebx,8(%edi)
- adcl %eax,%ebp C new lo + cylimb
- movl $0,%ebx
- adcl %edx,%ebx
-
- addl %ebp,12(%edi)
- adcl $0,%ebx C propagate carry into cylimb
-
- leal 16(%esi),%esi
- leal 16(%edi),%edi
- decl %ecx
- jnz L(oopX)
-
-L(endX):
- movl %ebx,(%edi)
- addl $4,%edi
-
- C we incremented wp and xp in the loop above; compensate
- movl PARAM_XSIZE,%eax
- shll $2,%eax
- subl %eax,%edi
- subl %eax,%esi
-
- movl VAR_COUNTER,%eax
- decl %eax
- movl %eax,VAR_COUNTER
- jnz L(outer)
-
-L(skip):
- popl %ebx
- popl %edi
- popl %ebp
- popl %esi
- addl $8,%esp
- ret
-
-L(done):
- movl %edx,4(%edi) C store to wp[1]
- popl %edi
- popl %ebp
- popl %esi
- addl $8,%esp
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README
deleted file mode 100644
index 7dbc905a0d..0000000000
--- a/rts/gmp/mpn/x86/p6/README
+++ /dev/null
@@ -1,95 +0,0 @@
-
- INTEL P6 MPN SUBROUTINES
-
-
-
-This directory contains code optimized for Intel P6 class CPUs, meaning
-PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
-have routines using MMX instructions.
-
-
-
-STATUS
-
-Times for the loops, with all code and data in L1 cache, are as follows.
-Some of these might be able to be improved.
-
- cycles/limb
-
- mpn_add_n/sub_n 3.7
-
- mpn_copyi 0.75
- mpn_copyd 2.4
-
- mpn_divrem_1 39.0
- mpn_mod_1 39.0
- mpn_divexact_by3 8.5
-
- mpn_mul_1 5.5
- mpn_addmul/submul_1 6.35
-
- mpn_l/rshift 2.5
-
- mpn_mul_basecase 8.2 cycles/crossproduct (approx)
- mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
- or 7.75 cycles/triangleproduct (approx)
-
-Pentium II and III have MMX and get the following improvements.
-
- mpn_divrem_1 25.0 integer part, 17.5 fractional part
- mpn_mod_1 24.0
-
- mpn_l/rshift 1.75
-
-
-
-
-NOTES
-
-Write-allocate L1 data cache means prefetching of destinations is unnecessary.
-
-Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
-to 26 cycles depending how far speculative execution has gone. The 9 cycle
-minimum penalty comes from the issue pipeline being 9 stages.
-
-A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
-5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
-cycles per 16 byte block.
-
-
-
-
-CODING
-
-Instructions in general code have been shown grouped if they can execute
-together, which means up to three instructions with no successive
-dependencies, and with only the first being a multiple micro-op.
-
-P6 has out-of-order execution, so the groupings are really only showing
-dependent paths where some shuffling might allow some latencies to be
-hidden.
-
-
-
-
-REFERENCES
-
-"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
-02/99, order number 245127 (order number 730795-001 is in the document too).
-Available on-line:
-
- http://download.intel.com/design/PentiumII/manuals/245127.htm
-
-"Intel Architecture Optimization Manual", 1997, order number 242816. This
-is an older document mostly about P5 and not as good as the above.
-Available on-line:
-
- http://download.intel.com/design/PentiumII/manuals/242816.htm
-
-
-
-----------------
-Local variables:
-mode: text
-fill-column: 76
-End:
diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
deleted file mode 100644
index feb364ec0b..0000000000
--- a/rts/gmp/mpn/x86/p6/aorsmul_1.asm
+++ /dev/null
@@ -1,300 +0,0 @@
-dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl
-dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl P6 UNROLL_COUNT cycles/limb
-dnl 8 6.7
-dnl 16 6.35
-dnl 32 6.3
-dnl 64 6.3
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_addmul_1', `
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
- define(M4_function_1c, mpn_addmul_1c)
- define(M4_description, add it to)
- define(M4_desc_retval, carry)
-',`ifdef(`OPERATION_submul_1', `
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
- define(M4_function_1c, mpn_submul_1c)
- define(M4_description, subtract it from)
- define(M4_desc_retval, borrow)
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult, mp_limb_t carry);
-C
-C Calculate src,size multiplied by mult and M4_description dst,size.
-C Return the M4_desc_retval limb from the top of the result.
-C
-C This code is pretty much the same as the K6 code. The unrolled loop is
-C the same, but there's just a few scheduling tweaks in the setups and the
-C simple loop.
-C
-C A number of variations have been tried for the unrolled loop, with one or
-C two carries, and with loads scheduled earlier, but nothing faster than 6
-C cycles/limb has been found.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 5)
-',`
-deflit(UNROLL_THRESHOLD, 5)
-')
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-
-PROLOGUE(M4_function_1c)
- pushl %ebx
-deflit(`FRAME',4)
- movl PARAM_CARRY, %ebx
- jmp LF(M4_function_1,start_nc)
-EPILOGUE()
-
-PROLOGUE(M4_function_1)
- push %ebx
-deflit(`FRAME',4)
- xorl %ebx, %ebx C initial carry
-
-L(start_nc):
- movl PARAM_SIZE, %ecx
- pushl %esi
-deflit(`FRAME',8)
-
- movl PARAM_SRC, %esi
- pushl %edi
-deflit(`FRAME',12)
-
- movl PARAM_DST, %edi
- pushl %ebp
-deflit(`FRAME',16)
- cmpl $UNROLL_THRESHOLD, %ecx
-
- movl PARAM_MULTIPLIER, %ebp
- jae L(unroll)
-
-
- C simple loop
- C this is offset 0x22, so close enough to aligned
-L(simple):
- C eax scratch
- C ebx carry
- C ecx counter
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
- movl (%esi), %eax
- addl $4, %edi
-
- mull %ebp
-
- addl %ebx, %eax
- adcl $0, %edx
-
- M4_inst %eax, -4(%edi)
- movl %edx, %ebx
-
- adcl $0, %ebx
- decl %ecx
-
- leal 4(%esi), %esi
- jnz L(simple)
-
-
- popl %ebp
- popl %edi
-
- popl %esi
- movl %ebx, %eax
-
- popl %ebx
- ret
-
-
-
-C------------------------------------------------------------------------------
-C VAR_JUMP holds the computed jump temporarily because there's not enough
-C registers when doing the mul for the initial two carry limbs.
-C
-C The add/adc for the initial carry in %ebx is necessary only for the
-C mpn_add/submul_1c entry points. Duplicating the startup code to
-C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good
-C idea.
-
-dnl overlapping with parameters already fetched
-define(VAR_COUNTER,`PARAM_SIZE')
-define(VAR_JUMP, `PARAM_DST')
-
- C this is offset 0x43, so close enough to aligned
-L(unroll):
- C eax
- C ebx initial carry
- C ecx size
- C edx
- C esi src
- C edi dst
- C ebp
-
- movl %ecx, %edx
- decl %ecx
-
- subl $2, %edx
- negl %ecx
-
- shrl $UNROLL_LOG2, %edx
- andl $UNROLL_MASK, %ecx
-
- movl %edx, VAR_COUNTER
- movl %ecx, %edx
-
- C 15 code bytes per limb
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- shll $4, %edx
- negl %ecx
-
- leal L(entry) (%edx,%ecx,1), %edx
-')
- movl (%esi), %eax C src low limb
-
- movl %edx, VAR_JUMP
- leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
-
- mull %ebp
-
- addl %ebx, %eax C initial carry (from _1c)
- adcl $0, %edx
-
- movl %edx, %ebx C high carry
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
-
- movl VAR_JUMP, %edx
- testl $1, %ecx
- movl %eax, %ecx C low carry
-
- cmovnz( %ebx, %ecx) C high,low carry other way around
- cmovnz( %eax, %ebx)
-
- jmp *%edx
-
-
-ifdef(`PIC',`
-L(pic_calc):
- shll $4, %edx
- negl %ecx
-
- C See README.family about old gas bugs
- leal (%edx,%ecx,1), %edx
- addl $L(entry)-L(here), %edx
-
- addl (%esp), %edx
-
- ret
-')
-
-
-C -----------------------------------------------------------
- ALIGN(32)
-L(top):
-deflit(`FRAME',16)
- C eax scratch
- C ebx carry hi
- C ecx carry lo
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
- C
- C VAR_COUNTER loop counter
- C
- C 15 code bytes per limb
-
- addl $UNROLL_BYTES, %edi
-
-L(entry):
-deflit(CHUNK_COUNT,2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
- deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
- deflit(`disp1', eval(disp0 + 4))
-
-Zdisp( movl, disp0,(%esi), %eax)
- mull %ebp
-Zdisp( M4_inst,%ecx, disp0,(%edi))
- adcl %eax, %ebx
- movl %edx, %ecx
- adcl $0, %ecx
-
- movl disp1(%esi), %eax
- mull %ebp
- M4_inst %ebx, disp1(%edi)
- adcl %eax, %ecx
- movl %edx, %ebx
- adcl $0, %ebx
-')
-
- decl VAR_COUNTER
- leal UNROLL_BYTES(%esi), %esi
-
- jns L(top)
-
-
-deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
-
- M4_inst %ecx, disp0(%edi)
- movl %ebx, %eax
-
- popl %ebp
- popl %edi
-
- popl %esi
- popl %ebx
- adcl $0, %eax
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm
deleted file mode 100644
index a77703ea89..0000000000
--- a/rts/gmp/mpn/x86/p6/diveby3.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
-dnl
-dnl P6: 8.5 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl The P5 code runs well on P6, in fact better than anything else found so
-dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the
-dnl dependent path are taking 4.5 cycles.
-dnl
-dnl The destination cache line prefetching is unnecessary on P6, but
-dnl removing it is a 2 cycle slowdown (approx), so it must be inducing
-dnl something good in the out of order execution.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_divexact_by3c)
-include_mpn(`x86/pentium/diveby3.asm')
diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h
deleted file mode 100644
index d7bfb6d60c..0000000000
--- a/rts/gmp/mpn/x86/p6/gmp-mparam.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-
-#define BITS_PER_MP_LIMB 32
-#define BYTES_PER_MP_LIMB 4
-#define BITS_PER_LONGINT 32
-#define BITS_PER_INT 32
-#define BITS_PER_SHORTINT 16
-#define BITS_PER_CHAR 8
-
-
-#ifndef UMUL_TIME
-#define UMUL_TIME 5 /* cycles */
-#endif
-#ifndef UDIV_TIME
-#define UDIV_TIME 39 /* cycles */
-#endif
-
-#ifndef COUNT_TRAILING_ZEROS_TIME
-#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */
-#endif
-
-
-/* Generated by tuneup.c, 2000-07-06. */
-
-#ifndef KARATSUBA_MUL_THRESHOLD
-#define KARATSUBA_MUL_THRESHOLD 23
-#endif
-#ifndef TOOM3_MUL_THRESHOLD
-#define TOOM3_MUL_THRESHOLD 139
-#endif
-
-#ifndef KARATSUBA_SQR_THRESHOLD
-#define KARATSUBA_SQR_THRESHOLD 52
-#endif
-#ifndef TOOM3_SQR_THRESHOLD
-#define TOOM3_SQR_THRESHOLD 166
-#endif
-
-#ifndef BZ_THRESHOLD
-#define BZ_THRESHOLD 116
-#endif
-
-#ifndef FIB_THRESHOLD
-#define FIB_THRESHOLD 66
-#endif
-
-#ifndef POWM_THRESHOLD
-#define POWM_THRESHOLD 20
-#endif
-
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 4
-#endif
-#ifndef GCDEXT_THRESHOLD
-#define GCDEXT_THRESHOLD 54
-#endif
-
-#ifndef FFT_MUL_TABLE
-#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_MUL_THRESHOLD
-#define FFT_MODF_MUL_THRESHOLD 608
-#endif
-#ifndef FFT_MUL_THRESHOLD
-#define FFT_MUL_THRESHOLD 5888
-#endif
-
-#ifndef FFT_SQR_TABLE
-#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 }
-#endif
-#ifndef FFT_MODF_SQR_THRESHOLD
-#define FFT_MODF_SQR_THRESHOLD 672
-#endif
-#ifndef FFT_SQR_THRESHOLD
-#define FFT_SQR_THRESHOLD 5888
-#endif
diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
deleted file mode 100644
index f1b011b623..0000000000
--- a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
-dnl
-dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor);
-C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
-C mp_srcptr src, mp_size_t size,
-C mp_limb_t divisor, mp_limb_t carry);
-C
-C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
-C see that file for some comments. It's likely what's here can be improved.
-
-
-dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
-dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
-dnl
-dnl The different speeds of the integer and fraction parts means that using
-dnl xsize+size isn't quite right. The threshold wants to be a bit higher
-dnl for the integer part and a bit lower for the fraction part. (Or what's
-dnl really wanted is to speed up the integer part!)
-dnl
-dnl The threshold is set to make the integer part right. At 4 limbs the
-dnl div and mul are about the same there, but on the fractional part the
-dnl mul is much faster.
-
-deflit(MUL_THRESHOLD, 4)
-
-
-defframe(PARAM_CARRY, 24)
-defframe(PARAM_DIVISOR,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC, 12)
-defframe(PARAM_XSIZE, 8)
-defframe(PARAM_DST, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-
-defframe(VAR_NORM, -20)
-defframe(VAR_INVERSE, -24)
-defframe(VAR_SRC, -28)
-defframe(VAR_DST, -32)
-defframe(VAR_DST_STOP,-36)
-
-deflit(STACK_SPACE, 36)
-
- .text
- ALIGN(16)
-
-PROLOGUE(mpn_divrem_1c)
-deflit(`FRAME',0)
- movl PARAM_CARRY, %edx
-
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- leal -4(%edi,%ebx,4), %edi
- jmp LF(mpn_divrem_1,start_1c)
-
-EPILOGUE()
-
-
- C offset 0x31, close enough to aligned
-PROLOGUE(mpn_divrem_1)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %ecx
- movl $0, %edx C initial carry (if can't skip a div)
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %ebx, SAVE_EBX
- movl PARAM_XSIZE, %ebx
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- orl %ecx, %ecx
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
- jz L(no_skip_div)
-
- movl -4(%esi,%ecx,4), %eax C src high limb
- cmpl %ebp, %eax C one less div if high<divisor
- jnb L(no_skip_div)
-
- movl $0, (%edi,%ecx,4) C dst high limb
- decl %ecx C size-1
- movl %eax, %edx C src high limb as initial carry
-L(no_skip_div):
-
-
-L(start_1c):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- leal (%ebx,%ecx), %eax C size+xsize
- cmpl $MUL_THRESHOLD, %eax
- jae L(mul_by_inverse)
-
- orl %ecx, %ecx
- jz L(divide_no_integer)
-
-L(divide_integer):
- C eax scratch (quotient)
- C ebx xsize
- C ecx counter
- C edx scratch (remainder)
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- movl -4(%esi,%ecx,4), %eax
-
- divl %ebp
-
- movl %eax, (%edi,%ecx,4)
- decl %ecx
- jnz L(divide_integer)
-
-
-L(divide_no_integer):
- movl PARAM_DST, %edi
- orl %ebx, %ebx
- jnz L(divide_fraction)
-
-L(divide_done):
- movl SAVE_ESI, %esi
-
- movl SAVE_EDI, %edi
-
- movl SAVE_EBX, %ebx
- movl %edx, %eax
-
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- ret
-
-
-L(divide_fraction):
- C eax scratch (quotient)
- C ebx counter
- C ecx
- C edx scratch (remainder)
- C esi
- C edi dst
- C ebp divisor
-
- movl $0, %eax
-
- divl %ebp
-
- movl %eax, -4(%edi,%ebx,4)
- decl %ebx
- jnz L(divide_fraction)
-
- jmp L(divide_done)
-
-
-
-C -----------------------------------------------------------------------------
-
-L(mul_by_inverse):
- C eax
- C ebx xsize
- C ecx size
- C edx carry
- C esi src
- C edi &dst[xsize-1]
- C ebp divisor
-
- leal 12(%edi), %ebx
-
- movl %ebx, VAR_DST_STOP
- leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
-
- movl %edi, VAR_DST
- movl %ecx, %ebx C size
-
- bsrl %ebp, %ecx C 31-l
- movl %edx, %edi C carry
-
- leal 1(%ecx), %eax C 32-l
- xorl $31, %ecx C l
-
- movl %ecx, VAR_NORM
- movl $-1, %edx
-
- shll %cl, %ebp C d normalized
- movd %eax, %mm7
-
- movl $-1, %eax
- subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
-
- divl %ebp C floor (b*(b-d)-1) / d
-
- movl %eax, VAR_INVERSE
- orl %ebx, %ebx C size
- leal -12(%esi,%ebx,4), %eax C &src[size-3]
-
- movl %eax, VAR_SRC
- jz L(start_zero)
-
- movl 8(%eax), %esi C src high limb
- cmpl $1, %ebx
- jz L(start_one)
-
-L(start_two_or_more):
- movl 4(%eax), %edx C src second highest limb
-
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shldl( %cl, %edx, %esi) C n10 = high,second << l
-
- cmpl $2, %ebx
- je L(integer_two_left)
- jmp L(integer_top)
-
-
-L(start_one):
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shll %cl, %esi C n10 = high << l
- jmp L(integer_one_left)
-
-
-L(start_zero):
- shll %cl, %edi C n2 = carry << l
- movl $0, %esi C n10 = 0
-
- C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
- C must have xsize!=0
- jmp L(fraction_some)
-
-
-
-C -----------------------------------------------------------------------------
-C
-C This loop runs at about 25 cycles, which is probably sub-optimal, and
-C certainly more than the dependent chain would suggest. A better loop, or
-C a better rough analysis of what's possible, would be welcomed.
-C
-C In the current implementation, the following successively dependent
-C micro-ops seem to exist.
-C
-C uops
-C n2+n1 1 (addl)
-C mul 5
-C q1+1 3 (addl/adcl)
-C mul 5
-C sub 3 (subl/sbbl)
-C addback 2 (cmov)
-C ---
-C 19
-C
-C Lack of registers hinders explicit scheduling and it might be that the
-C normal out of order execution isn't able to hide enough under the mul
-C latencies.
-C
-C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
-C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
-C combination was tried for the addback (despite the fact it would lengthen
-C the dependent chain) but found to be no faster.
-
-
- ALIGN(16)
-L(integer_top):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx scratch (src, dst)
- C edx scratch
- C esi n10
- C edi n2
- C ebp d
- C
- C mm0 scratch (src qword)
- C mm7 rshift for normalization
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
- movl VAR_SRC, %ecx
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
- movq (%ecx), %mm0 C next src limb and the one below it
-
- mull VAR_INVERSE C m*(n2+n1)
-
- subl $4, %ecx
-
- movl %ecx, VAR_SRC
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- movl %ebp, %eax C d
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
- jz L(q1_ff)
-
- mull %ebx C (q1+1)*d
-
- movl VAR_DST, %ecx
- psrlq %mm7, %mm0
-
- C
-
- C
-
- C
-
- subl %eax, %esi
- movl VAR_DST_STOP, %eax
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- movd %mm0, %esi
-
- sbbl $0, %ebx C q
- subl $4, %ecx
-
- movl %ebx, (%ecx)
- cmpl %eax, %ecx
-
- movl %ecx, VAR_DST
- jne L(integer_top)
-
-
-L(integer_loop_done):
-
-
-C -----------------------------------------------------------------------------
-C
-C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
-C q1_ff special case. This make the code a bit smaller and simpler, and
-C costs only 2 cycles (each).
-
-L(integer_two_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx scratch (src, dst)
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
- movl PARAM_SRC, %ecx
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movd (%ecx), %mm0 C src low limb
-
- movl VAR_DST_STOP, %ecx
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
-
- mull %ebx C (q1+1)*d
-
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
-
- C
-
- C
-
- subl %eax, %esi
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- movd %mm0, %esi
-
- sbbl $0, %ebx C q
-
- movl %ebx, -4(%ecx)
-
-
-C -----------------------------------------------------------------------------
-L(integer_one_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx scratch (dst)
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
- movl VAR_DST_STOP, %ecx
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- C
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- C
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx C q1 if q1+1 overflowed
-
- mull %ebx
-
- C
-
- C
-
- C
-
- C
-
- subl %eax, %esi
- movl PARAM_XSIZE, %eax
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
-
- sbbl $0, %ebx C q
-
- movl %ebx, -8(%ecx)
- subl $8, %ecx
-
-
-
- orl %eax, %eax C xsize
- jnz L(fraction_some)
-
- movl %edi, %eax
-L(fraction_done):
- movl VAR_NORM, %ecx
- movl SAVE_EBP, %ebp
-
- movl SAVE_EDI, %edi
-
- movl SAVE_ESI, %esi
-
- movl SAVE_EBX, %ebx
- addl $STACK_SPACE, %esp
-
- shrl %cl, %eax
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C
-C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
-C of q*d is simply -d and the remainder n-q*d = n10+d
-
-L(q1_ff):
- C eax (divisor)
- C ebx (q1+1 == 0)
- C ecx
- C edx
- C esi n10
- C edi n2
- C ebp divisor
-
- movl VAR_DST, %ecx
- movl VAR_DST_STOP, %edx
- subl $4, %ecx
-
- movl %ecx, VAR_DST
- psrlq %mm7, %mm0
- leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
-
- movl $-1, (%ecx)
- movd %mm0, %esi C next n10
-
- cmpl %ecx, %edx
- jne L(integer_top)
-
- jmp L(integer_loop_done)
-
-
-
-C -----------------------------------------------------------------------------
-C
-C In the current implementation, the following successively dependent
-C micro-ops seem to exist.
-C
-C uops
-C mul 5
-C q1+1 1 (addl)
-C mul 5
-C sub 3 (negl/sbbl)
-C addback 2 (cmov)
-C ---
-C 16
-C
-C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
-C the addback was found to be a touch slower.
-
-
- ALIGN(16)
-L(fraction_some):
- C eax
- C ebx
- C ecx
- C edx
- C esi
- C edi carry
- C ebp divisor
-
- movl PARAM_DST, %esi
- movl VAR_DST_STOP, %ecx
- movl %edi, %eax
-
- subl $8, %ecx
-
-
- ALIGN(16)
-L(fraction_top):
- C eax n2, then scratch
- C ebx scratch (nadj, q1)
- C ecx dst, decrementing
- C edx scratch
- C esi dst stop point
- C edi n2
- C ebp divisor
-
- mull VAR_INVERSE C m*n2
-
- movl %ebp, %eax C d
- subl $4, %ecx C dst
- leal 1(%edi), %ebx
-
- C
-
- C
-
- C
-
- addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
-
- mull %ebx C (q1+1)*d
-
- C
-
- C
-
- C
-
- C
-
- negl %eax C low of n - (q1+1)*d
-
- sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
- leal (%ebp,%eax), %edx
-
- cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
-
- sbbl $0, %ebx C q
- movl %eax, %edi C remainder->n2
- cmpl %esi, %ecx
-
- movl %ebx, (%ecx) C previous q
- jne L(fraction_top)
-
-
- jmp L(fraction_done)
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
deleted file mode 100644
index e7d8d94d33..0000000000
--- a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
+++ /dev/null
@@ -1,444 +0,0 @@
-dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
-dnl
-dnl P6MMX: 24.0 cycles/limb.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
-C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
-C mp_limb_t carry);
-C
-C The code here very similar to mpn_divrem_1, but with the quotient
-C discarded. What's here probably isn't optimal.
-C
-C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
-C comments.
-
-
-dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
-dnl used, rather than plain "divl"s. Minimum value 2.
-
-deflit(MUL_THRESHOLD, 4)
-
-
-defframe(PARAM_CARRY, 16)
-defframe(PARAM_DIVISOR,12)
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-
-defframe(VAR_NORM, -20)
-defframe(VAR_INVERSE, -24)
-defframe(VAR_SRC_STOP,-28)
-
-deflit(STACK_SPACE, 28)
-
- .text
- ALIGN(16)
-
-PROLOGUE(mpn_mod_1c)
-deflit(`FRAME',0)
- movl PARAM_CARRY, %edx
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
- jmp LF(mpn_mod_1,start_1c)
-
-EPILOGUE()
-
-
- ALIGN(16)
-PROLOGUE(mpn_mod_1)
-deflit(`FRAME',0)
-
- movl $0, %edx C initial carry (if can't skip a div)
- movl PARAM_SIZE, %ecx
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %esi, SAVE_ESI
- movl PARAM_SRC, %esi
-
- movl %ebp, SAVE_EBP
- movl PARAM_DIVISOR, %ebp
-
- orl %ecx, %ecx
- jz L(divide_done)
-
- movl -4(%esi,%ecx,4), %eax C src high limb
-
- cmpl %ebp, %eax C carry flag if high<divisor
-
- cmovc( %eax, %edx) C src high limb as initial carry
- sbbl $0, %ecx C size-1 to skip one div
- jz L(divide_done)
-
-
- ALIGN(16)
-L(start_1c):
- C eax
- C ebx
- C ecx size
- C edx carry
- C esi src
- C edi
- C ebp divisor
-
- cmpl $MUL_THRESHOLD, %ecx
- jae L(mul_by_inverse)
-
-
- orl %ecx, %ecx
- jz L(divide_done)
-
-
-L(divide_top):
- C eax scratch (quotient)
- C ebx
- C ecx counter, limbs, decrementing
- C edx scratch (remainder)
- C esi src
- C edi
- C ebp
-
- movl -4(%esi,%ecx,4), %eax
-
- divl %ebp
-
- decl %ecx
- jnz L(divide_top)
-
-
-L(divide_done):
- movl SAVE_ESI, %esi
- movl %edx, %eax
-
- movl SAVE_EBP, %ebp
- addl $STACK_SPACE, %esp
-
- ret
-
-
-
-C -----------------------------------------------------------------------------
-
-L(mul_by_inverse):
- C eax
- C ebx
- C ecx size
- C edx carry
- C esi src
- C edi
- C ebp divisor
-
- movl %ebx, SAVE_EBX
- leal -4(%esi), %ebx
-
- movl %ebx, VAR_SRC_STOP
- movl %ecx, %ebx C size
-
- movl %edi, SAVE_EDI
- movl %edx, %edi C carry
-
- bsrl %ebp, %ecx C 31-l
- movl $-1, %edx
-
- leal 1(%ecx), %eax C 32-l
- xorl $31, %ecx C l
-
- movl %ecx, VAR_NORM
- shll %cl, %ebp C d normalized
-
- movd %eax, %mm7
- movl $-1, %eax
- subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
-
- divl %ebp C floor (b*(b-d)-1) / d
-
- C
-
- movl %eax, VAR_INVERSE
- leal -12(%esi,%ebx,4), %eax C &src[size-3]
-
- movl 8(%eax), %esi C src high limb
- movl 4(%eax), %edx C src second highest limb
-
- shldl( %cl, %esi, %edi) C n2 = carry,high << l
-
- shldl( %cl, %edx, %esi) C n10 = high,second << l
-
- movl %eax, %ecx C &src[size-3]
-
-
-ifelse(MUL_THRESHOLD,2,`
- cmpl $2, %ebx
- je L(inverse_two_left)
-')
-
-
-C The dependent chain here is the same as in mpn_divrem_1, but a few
-C instructions are saved by not needing to store the quotient limbs. This
-C gets it down to 24 c/l, which is still a bit away from a theoretical 19
-C c/l.
-
- ALIGN(16)
-L(inverse_top):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx src pointer, decrementing
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 scratch (src qword)
- C mm7 rshift for normalization
-
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movq (%ecx), %mm0 C next src limb and the one below it
- subl $4, %ecx
-
- C
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
- movl %ebp, %eax C d
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
- jz L(q1_ff)
-
- mull %ebx C (q1+1)*d
-
- psrlq %mm7, %mm0
- movl VAR_SRC_STOP, %ebx
-
- C
-
- C
-
- C
-
- subl %eax, %esi
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- movd %mm0, %esi
- cmpl %ebx, %ecx
-
- jne L(inverse_top)
-
-
-L(inverse_loop_done):
-
-
-C -----------------------------------------------------------------------------
-
-L(inverse_two_left):
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx &src[-1]
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 scratch (src dword)
- C mm7 rshift
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movd 4(%ecx), %mm0 C src low limb
-
- C
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
- movl %ebp, %eax C d
-
- mull %ebx C (q1+1)*d
-
- psllq $32, %mm0
-
- psrlq %mm7, %mm0
-
- C
-
- C
-
- subl %eax, %esi
-
- sbbl %edx, %edi C n - (q1+1)*d
- movl %esi, %edi C remainder -> n2
- leal (%ebp,%esi), %edx
-
- cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
- movd %mm0, %esi
-
-
-C One limb left
-
- C eax scratch
- C ebx scratch (nadj, q1)
- C ecx
- C edx scratch
- C esi n10
- C edi n2
- C ebp divisor
- C
- C mm0 src limb, shifted
- C mm7 rshift
-
- movl %esi, %eax
- movl %ebp, %ebx
-
- sarl $31, %eax C -n1
-
- andl %eax, %ebx C -n1 & d
- negl %eax C n1
-
- addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
- addl %edi, %eax C n2+n1
-
- mull VAR_INVERSE C m*(n2+n1)
-
- movl VAR_NORM, %ecx C for final denorm
-
- C
-
- C
-
- C
-
- addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
- leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
-
- adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
-
- sbbl $0, %ebx
- movl %ebp, %eax C d
-
- mull %ebx C (q1+1)*d
-
- movl SAVE_EBX, %ebx
-
- C
-
- C
-
- C
-
- subl %eax, %esi
-
- sbbl %edx, %edi C n - (q1+1)*d
- leal (%ebp,%esi), %edx
- movl SAVE_EBP, %ebp
-
- movl %esi, %eax C remainder
- movl SAVE_ESI, %esi
-
- cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
- movl SAVE_EDI, %edi
-
- shrl %cl, %eax C denorm remainder
- addl $STACK_SPACE, %esp
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
-C
-C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
-C of q*d is simply -d and the remainder n-q*d = n10+d
-
-L(q1_ff):
- C eax (divisor)
- C ebx (q1+1 == 0)
- C ecx src pointer
- C edx
- C esi n10
- C edi (n2)
- C ebp divisor
-
- leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
- movl VAR_SRC_STOP, %edx
- psrlq %mm7, %mm0
-
- movd %mm0, %esi C next n10
- cmpl %ecx, %edx
- jne L(inverse_top)
-
- jmp L(inverse_loop_done)
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm
deleted file mode 100644
index 50f9a11218..0000000000
--- a/rts/gmp/mpn/x86/p6/mmx/popham.asm
+++ /dev/null
@@ -1,31 +0,0 @@
-dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
-dnl hamming distance.
-dnl
-dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb
-dnl (approx)
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
deleted file mode 100644
index e63fbf334b..0000000000
--- a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
+++ /dev/null
@@ -1,30 +0,0 @@
-dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
-dnl hamming distance.
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl Haven't actually measured it, but the K7 code with the psadbw should be
-dnl good on P-III.
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
deleted file mode 100644
index 174c78406a..0000000000
--- a/rts/gmp/mpn/x86/p6/sqr_basecase.asm
+++ /dev/null
@@ -1,641 +0,0 @@
-dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
-dnl
-dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
-dnl product (measured on the speed difference between 20 and 40 limbs,
-dnl which is the Karatsuba recursing range).
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
-dnl a description. The only difference here is that UNROLL_COUNT can go up
-dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
-
-deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
-
-ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
-`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
-
-m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
-deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
-
-
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
-C lot of function call overheads are avoided, especially when the given size
-C is small.
-C
-C The code size might look a bit excessive, but not all of it is executed so
-C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
-C clearly apply only to those sizes; mid sizes like 10x10 only need part of
-C the unrolled addmul; and big sizes like 40x40 that do use the full
-C unrolling will least be making good use of it, because 40x40 will take
-C something like 7000 cycles.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(32)
-PROLOGUE(mpn_sqr_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
-
- movl PARAM_SRC, %eax
-
- cmpl $2, %edx
- movl PARAM_DST, %ecx
- je L(two_limbs)
-
- movl (%eax), %eax
- ja L(three_or_more)
-
-
-C -----------------------------------------------------------------------------
-C one limb only
- C eax src limb
- C ebx
- C ecx dst
- C edx
-
- mull %eax
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(two_limbs):
- C eax src
- C ebx
- C ecx dst
- C edx
-
-defframe(SAVE_ESI, -4)
-defframe(SAVE_EBX, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(`STACK_SPACE',16)
-
- subl $STACK_SPACE, %esp
-deflit(`FRAME',STACK_SPACE)
-
- movl %esi, SAVE_ESI
- movl %eax, %esi
- movl (%eax), %eax
-
- mull %eax C src[0]^2
-
- movl %eax, (%ecx) C dst[0]
- movl 4(%esi), %eax
-
- movl %ebx, SAVE_EBX
- movl %edx, %ebx C dst[1]
-
- mull %eax C src[1]^2
-
- movl %edi, SAVE_EDI
- movl %eax, %edi C dst[2]
- movl (%esi), %eax
-
- movl %ebp, SAVE_EBP
- movl %edx, %ebp C dst[3]
-
- mull 4(%esi) C src[0]*src[1]
-
- addl %eax, %ebx
- movl SAVE_ESI, %esi
-
- adcl %edx, %edi
-
- adcl $0, %ebp
- addl %ebx, %eax
- movl SAVE_EBX, %ebx
-
- adcl %edi, %edx
- movl SAVE_EDI, %edi
-
- adcl $0, %ebp
-
- movl %eax, 4(%ecx)
-
- movl %ebp, 12(%ecx)
- movl SAVE_EBP, %ebp
-
- movl %edx, 8(%ecx)
- addl $FRAME, %esp
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(three_or_more):
- C eax src low limb
- C ebx
- C ecx dst
- C edx size
-deflit(`FRAME',0)
-
- pushl %esi defframe_pushl(`SAVE_ESI')
- cmpl $4, %edx
-
- movl PARAM_SRC, %esi
- jae L(four_or_more)
-
-
-C -----------------------------------------------------------------------------
-C three limbs
-
- C eax src low limb
- C ebx
- C ecx dst
- C edx
- C esi src
- C edi
- C ebp
-
- pushl %ebp defframe_pushl(`SAVE_EBP')
- pushl %edi defframe_pushl(`SAVE_EDI')
-
- mull %eax C src[0] ^ 2
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
-
- movl 4(%esi), %eax
- xorl %ebp, %ebp
-
- mull %eax C src[1] ^ 2
-
- movl %eax, 8(%ecx)
- movl %edx, 12(%ecx)
- movl 8(%esi), %eax
-
- pushl %ebx defframe_pushl(`SAVE_EBX')
-
- mull %eax C src[2] ^ 2
-
- movl %eax, 16(%ecx)
- movl %edx, 20(%ecx)
-
- movl (%esi), %eax
-
- mull 4(%esi) C src[0] * src[1]
-
- movl %eax, %ebx
- movl %edx, %edi
-
- movl (%esi), %eax
-
- mull 8(%esi) C src[0] * src[2]
-
- addl %eax, %edi
- movl %edx, %ebp
-
- adcl $0, %ebp
- movl 4(%esi), %eax
-
- mull 8(%esi) C src[1] * src[2]
-
- xorl %esi, %esi
- addl %eax, %ebp
-
- C eax
- C ebx dst[1]
- C ecx dst
- C edx dst[4]
- C esi zero, will be dst[5]
- C edi dst[2]
- C ebp dst[3]
-
- adcl $0, %edx
- addl %ebx, %ebx
-
- adcl %edi, %edi
-
- adcl %ebp, %ebp
-
- adcl %edx, %edx
- movl 4(%ecx), %eax
-
- adcl $0, %esi
- addl %ebx, %eax
-
- movl %eax, 4(%ecx)
- movl 8(%ecx), %eax
-
- adcl %edi, %eax
- movl 12(%ecx), %ebx
-
- adcl %ebp, %ebx
- movl 16(%ecx), %edi
-
- movl %eax, 8(%ecx)
- movl SAVE_EBP, %ebp
-
- movl %ebx, 12(%ecx)
- movl SAVE_EBX, %ebx
-
- adcl %edx, %edi
- movl 20(%ecx), %eax
-
- movl %edi, 16(%ecx)
- movl SAVE_EDI, %edi
-
- adcl %esi, %eax C no carry out of this
- movl SAVE_ESI, %esi
-
- movl %eax, 20(%ecx)
- addl $FRAME, %esp
-
- ret
-
-
-
-C -----------------------------------------------------------------------------
-defframe(VAR_COUNTER,-20)
-defframe(VAR_JMP, -24)
-deflit(`STACK_SPACE',24)
-
-L(four_or_more):
- C eax src low limb
- C ebx
- C ecx
- C edx size
- C esi src
- C edi
- C ebp
-deflit(`FRAME',4) dnl %esi already pushed
-
-C First multiply src[0]*src[1..size-1] and store at dst[1..size].
-
- subl $STACK_SPACE-FRAME, %esp
-deflit(`FRAME',STACK_SPACE)
- movl $1, %ecx
-
- movl %edi, SAVE_EDI
- movl PARAM_DST, %edi
-
- movl %ebx, SAVE_EBX
- subl %edx, %ecx C -(size-1)
-
- movl %ebp, SAVE_EBP
- movl $0, %ebx C initial carry
-
- leal (%esi,%edx,4), %esi C &src[size]
- movl %eax, %ebp C multiplier
-
- leal -4(%edi,%edx,4), %edi C &dst[size-1]
-
-
-C This loop runs at just over 6 c/l.
-
-L(mul_1):
- C eax scratch
- C ebx carry
- C ecx counter, limbs, negative, -(size-1) to -1
- C edx scratch
- C esi &src[size]
- C edi &dst[size-1]
- C ebp multiplier
-
- movl %ebp, %eax
-
- mull (%esi,%ecx,4)
-
- addl %ebx, %eax
- movl $0, %ebx
-
- adcl %edx, %ebx
- movl %eax, 4(%edi,%ecx,4)
-
- incl %ecx
- jnz L(mul_1)
-
-
- movl %ebx, 4(%edi)
-
-
-C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
-C
-C The last two addmuls, which are the bottom right corner of the product
-C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
-C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
-C cases that need to be done.
-C
-C The unrolled code is the same as mpn_addmul_1(), see that routine for some
-C comments.
-C
-C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
-C
-C VAR_JMP is the computed jump into the unrolled code, stepped by one code
-C chunk each outer loop.
-
-dnl This is also hard-coded in the address calculation below.
-deflit(CODE_BYTES_PER_LIMB, 15)
-
-dnl With &src[size] and &dst[size-1] pointers, the displacements in the
-dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
-dnl that an offset must be added to them.
-deflit(OFFSET,
-ifelse(eval(UNROLL_COUNT>32),1,
-eval((UNROLL_COUNT-32)*4),
-0))
-
- C eax
- C ebx carry
- C ecx
- C edx
- C esi &src[size]
- C edi &dst[size-1]
- C ebp
-
- movl PARAM_SIZE, %ecx
-
- subl $4, %ecx
- jz L(corner)
-
- movl %ecx, %edx
- negl %ecx
-
- shll $4, %ecx
-ifelse(OFFSET,0,,`subl $OFFSET, %esi')
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
-')
- negl %edx
-
-ifelse(OFFSET,0,,`subl $OFFSET, %edi')
-
- C The calculated jump mustn't be before the start of the available
- C code. This is the limit that UNROLL_COUNT puts on the src operand
- C size, but checked here using the jump address directly.
-
- ASSERT(ae,
- `movl_text_address( L(unroll_inner_start), %eax)
- cmpl %eax, %ecx')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll_outer_top):
- C eax
- C ebx high limb to store
- C ecx VAR_JMP
- C edx VAR_COUNTER, limbs, negative
- C esi &src[size], constant
- C edi dst ptr, second highest limb of last addmul
- C ebp
-
- movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
- movl %edx, VAR_COUNTER
-
- movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
-
- mull %ebp
-
-define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
-
- testb $1, %cl
-
- movl %edx, %ebx C high carry
- leal 4(%edi), %edi
-
- movl %ecx, %edx C jump
-
- movl %eax, %ecx C low carry
- leal CODE_BYTES_PER_LIMB(%edx), %edx
-
- cmovX( %ebx, %ecx) C high carry reverse
- cmovX( %eax, %ebx) C low carry reverse
- movl %edx, VAR_JMP
- jmp *%edx
-
-
- C Must be on an even address here so the low bit of the jump address
- C will indicate which way around ecx/ebx should start.
-
- ALIGN(2)
-
-L(unroll_inner_start):
- C eax scratch
- C ebx carry high
- C ecx carry low
- C edx scratch
- C esi src pointer
- C edi dst pointer
- C ebp multiplier
- C
- C 15 code bytes each limb
- C ecx/ebx reversed on each chunk
-
-forloop(`i', UNROLL_COUNT, 1, `
- deflit(`disp_src', eval(-i*4 + OFFSET))
- deflit(`disp_dst', eval(disp_src))
-
- m4_assert(`disp_src>=-128 && disp_src<128')
- m4_assert(`disp_dst>=-128 && disp_dst<128')
-
-ifelse(eval(i%2),0,`
-Zdisp( movl, disp_src,(%esi), %eax)
- mull %ebp
-Zdisp( addl, %ebx, disp_dst,(%edi))
- adcl %eax, %ecx
- movl %edx, %ebx
- adcl $0, %ebx
-',`
- dnl this one comes out last
-Zdisp( movl, disp_src,(%esi), %eax)
- mull %ebp
-Zdisp( addl, %ecx, disp_dst,(%edi))
- adcl %eax, %ebx
- movl %edx, %ecx
- adcl $0, %ecx
-')
-')
-L(unroll_inner_end):
-
- addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
-
- movl VAR_COUNTER, %edx
- adcl $0, %ecx
-
- movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
- movl VAR_JMP, %ecx
-
- incl %edx
- jnz L(unroll_outer_top)
-
-
-ifelse(OFFSET,0,,`
- addl $OFFSET, %esi
- addl $OFFSET, %edi
-')
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(corner):
- C eax
- C ebx
- C ecx
- C edx
- C esi &src[size]
- C edi &dst[2*size-5]
- C ebp
-
- movl -12(%esi), %eax
-
- mull -8(%esi)
-
- addl %eax, (%edi)
- movl -12(%esi), %eax
- movl $0, %ebx
-
- adcl %edx, %ebx
-
- mull -4(%esi)
-
- addl %eax, %ebx
- movl -8(%esi), %eax
-
- adcl $0, %edx
-
- addl %ebx, 4(%edi)
- movl $0, %ebx
-
- adcl %edx, %ebx
-
- mull -4(%esi)
-
- movl PARAM_SIZE, %ecx
- addl %ebx, %eax
-
- adcl $0, %edx
-
- movl %eax, 8(%edi)
-
- movl %edx, 12(%edi)
- movl PARAM_DST, %edi
-
-
-C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
-
- subl $1, %ecx C size-1
- xorl %eax, %eax C ready for final adcl, and clear carry
-
- movl %ecx, %edx
- movl PARAM_SRC, %esi
-
-
-L(lshift):
- C eax
- C ebx
- C ecx counter, size-1 to 1
- C edx size-1 (for later use)
- C esi src (for later use)
- C edi dst, incrementing
- C ebp
-
- rcll 4(%edi)
- rcll 8(%edi)
-
- leal 8(%edi), %edi
- decl %ecx
- jnz L(lshift)
-
-
- adcl %eax, %eax
-
- movl %eax, 4(%edi) C dst most significant limb
- movl (%esi), %eax C src[0]
-
- leal 4(%esi,%edx,4), %esi C &src[size]
- subl %edx, %ecx C -(size-1)
-
-
-C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
-C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
-C low limb of src[0]^2.
-
-
- mull %eax
-
- movl %eax, (%edi,%ecx,8) C dst[0]
-
-
-L(diag):
- C eax scratch
- C ebx scratch
- C ecx counter, negative
- C edx carry
- C esi &src[size]
- C edi dst[2*size-2]
- C ebp
-
- movl (%esi,%ecx,4), %eax
- movl %edx, %ebx
-
- mull %eax
-
- addl %ebx, 4(%edi,%ecx,8)
- adcl %eax, 8(%edi,%ecx,8)
- adcl $0, %edx
-
- incl %ecx
- jnz L(diag)
-
-
- movl SAVE_ESI, %esi
- movl SAVE_EBX, %ebx
-
- addl %edx, 4(%edi) C dst most significant limb
-
- movl SAVE_EDI, %edi
- movl SAVE_EBP, %ebp
- addl $FRAME, %esp
- ret
-
-
-
-C -----------------------------------------------------------------------------
-ifdef(`PIC',`
-L(pic_calc):
- addl (%esp), %ecx
- addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
- addl %edx, %ecx
- ret
-')
-
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README
deleted file mode 100644
index 3b9ec8ac6f..0000000000
--- a/rts/gmp/mpn/x86/pentium/README
+++ /dev/null
@@ -1,77 +0,0 @@
-
- INTEL PENTIUM P5 MPN SUBROUTINES
-
-
-This directory contains mpn functions optimized for Intel Pentium (P5,P54)
-processors. The mmx subdirectory has code for Pentium with MMX (P55).
-
-
-STATUS
-
- cycles/limb
-
- mpn_add_n/sub_n 2.375
-
- mpn_copyi/copyd 1.0
-
- mpn_divrem_1 44.0
- mpn_mod_1 44.0
- mpn_divexact_by3 15.0
-
- mpn_l/rshift 5.375 normal (6.0 on P54)
- 1.875 special shift by 1 bit
-
- mpn_mul_1 13.0
- mpn_add/submul_1 14.0
-
- mpn_mul_basecase 14.2 cycles/crossproduct (approx)
-
- mpn_sqr_basecase 8 cycles/crossproduct (approx)
- or 15.5 cycles/triangleproduct (approx)
-
-Pentium MMX gets the following improvements
-
- mpn_l/rshift 1.75
-
-
-1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
-documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
-or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
-
-2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
-overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb.
-
-3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
-should. Intel documentation says a mul instruction is 10 cycles, but it
-measures 9 and the routines using it run with it as 9.
-
-
-
-RELEVANT OPTIMIZATION ISSUES
-
-1. Pentium doesn't allocate cache lines on writes, unlike most other modern
-processors. Since the functions in the mpn class do array writes, we have to
-handle allocating the destination cache lines by reading a word from it in the
-loops, to achieve the best performance.
-
-2. Pairing of memory operations requires that the two issued operations refer
-to different cache banks. The simplest way to insure this is to read/write
-two words from the same object. If we make operations on different objects,
-they might or might not be to the same cache bank.
-
-
-
-REFERENCES
-
-"Intel Architecture Optimization Manual", 1997, order number 242816. This
-is mostly about P5, the parts about P6 aren't relevant. Available on-line:
-
- http://download.intel.com/design/PentiumII/manuals/242816.htm
-
-
-
-----------------
-Local variables:
-mode: text
-fill-column: 76
-End:
diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm
deleted file mode 100644
index a61082a456..0000000000
--- a/rts/gmp/mpn/x86/pentium/aors_n.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
-dnl
-dnl P5: 2.375 cycles/limb
-
-
-dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
-dnl Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-ifdef(`OPERATION_add_n',`
- define(M4_inst, adcl)
- define(M4_function_n, mpn_add_n)
- define(M4_function_nc, mpn_add_nc)
-
-',`ifdef(`OPERATION_sub_n',`
- define(M4_inst, sbbl)
- define(M4_function_n, mpn_sub_n)
- define(M4_function_nc, mpn_sub_nc)
-
-',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
-')')')
-
-MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
-
-
-C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size);
-C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
-C mp_size_t size, mp_limb_t carry);
-
-defframe(PARAM_CARRY,20)
-defframe(PARAM_SIZE, 16)
-defframe(PARAM_SRC2, 12)
-defframe(PARAM_SRC1, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(M4_function_nc)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC1,%esi
- movl PARAM_SRC2,%ebp
- movl PARAM_SIZE,%ecx
-
- movl (%ebp),%ebx
-
- decl %ecx
- movl %ecx,%edx
- shrl $3,%ecx
- andl $7,%edx
- testl %ecx,%ecx C zero carry flag
- jz L(endgo)
-
- pushl %edx
-FRAME_pushl()
- movl PARAM_CARRY,%eax
- shrl $1,%eax C shift bit 0 into carry
- jmp LF(M4_function_n,oop)
-
-L(endgo):
-deflit(`FRAME',16)
- movl PARAM_CARRY,%eax
- shrl $1,%eax C shift bit 0 into carry
- jmp LF(M4_function_n,end)
-
-EPILOGUE()
-
-
- ALIGN(8)
-PROLOGUE(M4_function_n)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC1,%esi
- movl PARAM_SRC2,%ebp
- movl PARAM_SIZE,%ecx
-
- movl (%ebp),%ebx
-
- decl %ecx
- movl %ecx,%edx
- shrl $3,%ecx
- andl $7,%edx
- testl %ecx,%ecx C zero carry flag
- jz L(end)
- pushl %edx
-FRAME_pushl()
-
- ALIGN(8)
-L(oop): movl 28(%edi),%eax C fetch destination cache line
- leal 32(%edi),%edi
-
-L(1): movl (%esi),%eax
- movl 4(%esi),%edx
- M4_inst %ebx,%eax
- movl 4(%ebp),%ebx
- M4_inst %ebx,%edx
- movl 8(%ebp),%ebx
- movl %eax,-32(%edi)
- movl %edx,-28(%edi)
-
-L(2): movl 8(%esi),%eax
- movl 12(%esi),%edx
- M4_inst %ebx,%eax
- movl 12(%ebp),%ebx
- M4_inst %ebx,%edx
- movl 16(%ebp),%ebx
- movl %eax,-24(%edi)
- movl %edx,-20(%edi)
-
-L(3): movl 16(%esi),%eax
- movl 20(%esi),%edx
- M4_inst %ebx,%eax
- movl 20(%ebp),%ebx
- M4_inst %ebx,%edx
- movl 24(%ebp),%ebx
- movl %eax,-16(%edi)
- movl %edx,-12(%edi)
-
-L(4): movl 24(%esi),%eax
- movl 28(%esi),%edx
- M4_inst %ebx,%eax
- movl 28(%ebp),%ebx
- M4_inst %ebx,%edx
- movl 32(%ebp),%ebx
- movl %eax,-8(%edi)
- movl %edx,-4(%edi)
-
- leal 32(%esi),%esi
- leal 32(%ebp),%ebp
- decl %ecx
- jnz L(oop)
-
- popl %edx
-FRAME_popl()
-L(end):
- decl %edx C test %edx w/o clobbering carry
- js L(end2)
- incl %edx
-L(oop2):
- leal 4(%edi),%edi
- movl (%esi),%eax
- M4_inst %ebx,%eax
- movl 4(%ebp),%ebx
- movl %eax,-4(%edi)
- leal 4(%esi),%esi
- leal 4(%ebp),%ebp
- decl %edx
- jnz L(oop2)
-L(end2):
- movl (%esi),%eax
- M4_inst %ebx,%eax
- movl %eax,(%edi)
-
- sbbl %eax,%eax
- negl %eax
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
deleted file mode 100644
index 147b55610f..0000000000
--- a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
+++ /dev/null
@@ -1,99 +0,0 @@
-dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
-dnl
-dnl P5: 14.0 cycles/limb
-
-
-dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA. */
-
-
-include(`../config.m4')
-
-
-ifdef(`OPERATION_addmul_1', `
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
-
-',`ifdef(`OPERATION_submul_1', `
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
-
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-
-PROLOGUE(M4_function_1)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST, %edi
- movl PARAM_SRC, %esi
- movl PARAM_SIZE, %ecx
- movl PARAM_MULTIPLIER, %ebp
-
- leal (%edi,%ecx,4), %edi
- leal (%esi,%ecx,4), %esi
- negl %ecx
- xorl %ebx, %ebx
- ALIGN(8)
-
-L(oop): adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
-
- mull %ebp
-
- addl %ebx, %eax
- movl (%edi,%ecx,4), %ebx
-
- adcl $0, %edx
- M4_inst %eax, %ebx
-
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
-
- movl %edx, %ebx
- jnz L(oop)
-
- adcl $0, %ebx
- movl %ebx, %eax
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm
deleted file mode 100644
index dbac81642f..0000000000
--- a/rts/gmp/mpn/x86/pentium/diveby3.asm
+++ /dev/null
@@ -1,183 +0,0 @@
-dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
-dnl
-dnl P5: 15.0 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t carry);
-
-defframe(PARAM_CARRY,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
-dnl multiplicative inverse of 3, modulo 2^32
-deflit(INVERSE_3, 0xAAAAAAAB)
-
-dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
-deflit(ONE_THIRD_CEIL, 0x55555556)
-deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
-deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
-
- .text
- ALIGN(8)
-
-PROLOGUE(mpn_divexact_by3c)
-deflit(`FRAME',0)
-
- movl PARAM_SRC, %ecx
- movl PARAM_SIZE, %edx
-
- decl %edx
- jnz L(two_or_more)
-
- movl (%ecx), %edx
- movl PARAM_CARRY, %eax C risk of cache bank clash here
-
- movl PARAM_DST, %ecx
- subl %eax, %edx
-
- sbbl %eax, %eax C 0 or -1
-
- imull $INVERSE_3, %edx, %edx
-
- negl %eax C 0 or 1
- cmpl $ONE_THIRD_CEIL, %edx
-
- sbbl $-1, %eax C +1 if edx>=ceil(b/3)
- cmpl $TWO_THIRDS_CEIL, %edx
-
- sbbl $-1, %eax C +1 if edx>=ceil(b*2/3)
- movl %edx, (%ecx)
-
- ret
-
-
-L(two_or_more):
- C eax
- C ebx
- C ecx src
- C edx size-1
- C esi
- C edi
- C ebp
-
- pushl %ebx FRAME_pushl()
- pushl %esi FRAME_pushl()
-
- pushl %edi FRAME_pushl()
- pushl %ebp FRAME_pushl()
-
- movl PARAM_DST, %edi
- movl PARAM_CARRY, %esi
-
- movl (%ecx), %eax C src low limb
- xorl %ebx, %ebx
-
- sub %esi, %eax
- movl $TWO_THIRDS_FLOOR, %esi
-
- leal (%ecx,%edx,4), %ecx C &src[size-1]
- leal (%edi,%edx,4), %edi C &dst[size-1]
-
- adcl $0, %ebx C carry, 0 or 1
- negl %edx C -(size-1)
-
-
-C The loop needs a source limb ready at the top, which leads to one limb
-C handled separately at the end, and the special case above for size==1.
-C There doesn't seem to be any scheduling that would keep the speed but move
-C the source load and carry subtract up to the top.
-C
-C The destination cache line prefetching adds 1 cycle to the loop but is
-C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
-C repeated write-throughs if the destination isn't in L1. A version using
-C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
-C no faster, due to unavoidable branch mispreditions in the inner loop.
-C
-C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
-C could be avoided then the src limb fetch could pair up and save a cycle.
-C This would probably mean going to a two limb loop with the carry limb
-C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
-C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
-C
-C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
-C "cmpl %edx, $n" with the immediate as the second operand.
-C
-C The "4" source displacement is in the loop rather than the setup because
-C this gets L(top) aligned to 8 bytes at no cost.
-
- ALIGN(8)
-L(top):
- C eax source limb, carry subtracted
- C ebx carry (0 or 1)
- C ecx &src[size-1]
- C edx counter, limbs, negative
- C esi TWO_THIRDS_FLOOR
- C edi &dst[size-1]
- C ebp scratch (result limb)
-
- imull $INVERSE_3, %eax, %ebp
-
- cmpl $ONE_THIRD_CEIL, %ebp
- movl (%edi,%edx,4), %eax C dst cache line prefetch
-
- sbbl $-1, %ebx C +1 if ebp>=ceil(b/3)
- cmpl %ebp, %esi
-
- movl 4(%ecx,%edx,4), %eax C next src limb
-
- sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3)
- movl $0, %ebx
-
- adcl $0, %ebx C new carry
- movl %ebp, (%edi,%edx,4)
-
- incl %edx
- jnz L(top)
-
-
-
- imull $INVERSE_3, %eax, %edx
-
- cmpl $ONE_THIRD_CEIL, %edx
- movl %edx, (%edi)
-
- sbbl $-1, %ebx C +1 if edx>=ceil(b/3)
- cmpl $TWO_THIRDS_CEIL, %edx
-
- sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3)
- popl %ebp
-
- movl %ebx, %eax
- popl %edi
-
- popl %esi
- popl %ebx
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
deleted file mode 100644
index d3ed3d73ce..0000000000
--- a/rts/gmp/mpn/x86/pentium/gmp-mparam.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-
-#define BITS_PER_MP_LIMB 32
-#define BYTES_PER_MP_LIMB 4
-#define BITS_PER_LONGINT 32
-#define BITS_PER_INT 32
-#define BITS_PER_SHORTINT 16
-#define BITS_PER_CHAR 8
-
-
-#ifndef UMUL_TIME
-#define UMUL_TIME 9 /* cycles */
-#endif
-#ifndef UDIV_TIME
-#define UDIV_TIME 41 /* cycles */
-#endif
-
-/* bsf takes 18-42 cycles, put an average for uniform random numbers */
-#ifndef COUNT_TRAILING_ZEROS_TIME
-#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
-#endif
-
-
-/* Generated by tuneup.c, 2000-07-06. */
-
-#ifndef KARATSUBA_MUL_THRESHOLD
-#define KARATSUBA_MUL_THRESHOLD 14
-#endif
-#ifndef TOOM3_MUL_THRESHOLD
-#define TOOM3_MUL_THRESHOLD 179
-#endif
-
-#ifndef KARATSUBA_SQR_THRESHOLD
-#define KARATSUBA_SQR_THRESHOLD 22
-#endif
-#ifndef TOOM3_SQR_THRESHOLD
-#define TOOM3_SQR_THRESHOLD 153
-#endif
-
-#ifndef BZ_THRESHOLD
-#define BZ_THRESHOLD 46
-#endif
-
-#ifndef FIB_THRESHOLD
-#define FIB_THRESHOLD 110
-#endif
-
-#ifndef POWM_THRESHOLD
-#define POWM_THRESHOLD 13
-#endif
-
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 4
-#endif
-#ifndef GCDEXT_THRESHOLD
-#define GCDEXT_THRESHOLD 25
-#endif
-
-#ifndef FFT_MUL_TABLE
-#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_MUL_THRESHOLD
-#define FFT_MODF_MUL_THRESHOLD 512
-#endif
-#ifndef FFT_MUL_THRESHOLD
-#define FFT_MUL_THRESHOLD 3840
-#endif
-
-#ifndef FFT_SQR_TABLE
-#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_SQR_THRESHOLD
-#define FFT_MODF_SQR_THRESHOLD 512
-#endif
-#ifndef FFT_SQR_THRESHOLD
-#define FFT_SQR_THRESHOLD 3840
-#endif
diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm
deleted file mode 100644
index e1e35d4c57..0000000000
--- a/rts/gmp/mpn/x86/pentium/lshift.asm
+++ /dev/null
@@ -1,236 +0,0 @@
-dnl Intel Pentium mpn_lshift -- mpn left shift.
-dnl
-dnl cycles/limb
-dnl P5,P54: 6.0
-dnl P55: 5.375
-
-
-dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
-dnl Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
-C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_lshift)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%ebp
- movl PARAM_SHIFT,%ecx
-
-C We can use faster code for shift-by-1 under certain conditions.
- cmp $1,%ecx
- jne L(normal)
- leal 4(%esi),%eax
- cmpl %edi,%eax
- jnc L(special) C jump if s_ptr + 1 >= res_ptr
- leal (%esi,%ebp,4),%eax
- cmpl %eax,%edi
- jnc L(special) C jump if res_ptr >= s_ptr + size
-
-L(normal):
- leal -4(%edi,%ebp,4),%edi
- leal -4(%esi,%ebp,4),%esi
-
- movl (%esi),%edx
- subl $4,%esi
- xorl %eax,%eax
- shldl( %cl, %edx, %eax) C compute carry limb
- pushl %eax C push carry limb onto stack
-
- decl %ebp
- pushl %ebp
- shrl $3,%ebp
- jz L(end)
-
- movl (%edi),%eax C fetch destination cache line
-
- ALIGN(4)
-L(oop): movl -28(%edi),%eax C fetch destination cache line
- movl %edx,%ebx
-
- movl (%esi),%eax
- movl -4(%esi),%edx
- shldl( %cl, %eax, %ebx)
- shldl( %cl, %edx, %eax)
- movl %ebx,(%edi)
- movl %eax,-4(%edi)
-
- movl -8(%esi),%ebx
- movl -12(%esi),%eax
- shldl( %cl, %ebx, %edx)
- shldl( %cl, %eax, %ebx)
- movl %edx,-8(%edi)
- movl %ebx,-12(%edi)
-
- movl -16(%esi),%edx
- movl -20(%esi),%ebx
- shldl( %cl, %edx, %eax)
- shldl( %cl, %ebx, %edx)
- movl %eax,-16(%edi)
- movl %edx,-20(%edi)
-
- movl -24(%esi),%eax
- movl -28(%esi),%edx
- shldl( %cl, %eax, %ebx)
- shldl( %cl, %edx, %eax)
- movl %ebx,-24(%edi)
- movl %eax,-28(%edi)
-
- subl $32,%esi
- subl $32,%edi
- decl %ebp
- jnz L(oop)
-
-L(end): popl %ebp
- andl $7,%ebp
- jz L(end2)
-L(oop2):
- movl (%esi),%eax
- shldl( %cl,%eax,%edx)
- movl %edx,(%edi)
- movl %eax,%edx
- subl $4,%esi
- subl $4,%edi
- decl %ebp
- jnz L(oop2)
-
-L(end2):
- shll %cl,%edx C compute least significant limb
- movl %edx,(%edi) C store it
-
- popl %eax C pop carry limb
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-
-C We loop from least significant end of the arrays, which is only
-C permissable if the source and destination don't overlap, since the
-C function is documented to work for overlapping source and destination.
-
-L(special):
- movl (%esi),%edx
- addl $4,%esi
-
- decl %ebp
- pushl %ebp
- shrl $3,%ebp
-
- addl %edx,%edx
- incl %ebp
- decl %ebp
- jz L(Lend)
-
- movl (%edi),%eax C fetch destination cache line
-
- ALIGN(4)
-L(Loop):
- movl 28(%edi),%eax C fetch destination cache line
- movl %edx,%ebx
-
- movl (%esi),%eax
- movl 4(%esi),%edx
- adcl %eax,%eax
- movl %ebx,(%edi)
- adcl %edx,%edx
- movl %eax,4(%edi)
-
- movl 8(%esi),%ebx
- movl 12(%esi),%eax
- adcl %ebx,%ebx
- movl %edx,8(%edi)
- adcl %eax,%eax
- movl %ebx,12(%edi)
-
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- adcl %edx,%edx
- movl %eax,16(%edi)
- adcl %ebx,%ebx
- movl %edx,20(%edi)
-
- movl 24(%esi),%eax
- movl 28(%esi),%edx
- adcl %eax,%eax
- movl %ebx,24(%edi)
- adcl %edx,%edx
- movl %eax,28(%edi)
-
- leal 32(%esi),%esi C use leal not to clobber carry
- leal 32(%edi),%edi
- decl %ebp
- jnz L(Loop)
-
-L(Lend):
- popl %ebp
- sbbl %eax,%eax C save carry in %eax
- andl $7,%ebp
- jz L(Lend2)
- addl %eax,%eax C restore carry from eax
-L(Loop2):
- movl %edx,%ebx
- movl (%esi),%edx
- adcl %edx,%edx
- movl %ebx,(%edi)
-
- leal 4(%esi),%esi C use leal not to clobber carry
- leal 4(%edi),%edi
- decl %ebp
- jnz L(Loop2)
-
- jmp L(L1)
-L(Lend2):
- addl %eax,%eax C restore carry from eax
-L(L1): movl %edx,(%edi) C store last limb
-
- sbbl %eax,%eax
- negl %eax
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
deleted file mode 100644
index 2379077d0c..0000000000
--- a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
-
-Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-
-#define BITS_PER_MP_LIMB 32
-#define BYTES_PER_MP_LIMB 4
-#define BITS_PER_LONGINT 32
-#define BITS_PER_INT 32
-#define BITS_PER_SHORTINT 16
-#define BITS_PER_CHAR 8
-
-
-#ifndef UMUL_TIME
-#define UMUL_TIME 9 /* cycles */
-#endif
-#ifndef UDIV_TIME
-#define UDIV_TIME 41 /* cycles */
-#endif
-
-/* bsf takes 18-42 cycles, put an average for uniform random numbers */
-#ifndef COUNT_TRAILING_ZEROS_TIME
-#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
-#endif
-
-
-/* Generated by tuneup.c, 2000-07-06. */
-
-#ifndef KARATSUBA_MUL_THRESHOLD
-#define KARATSUBA_MUL_THRESHOLD 14
-#endif
-#ifndef TOOM3_MUL_THRESHOLD
-#define TOOM3_MUL_THRESHOLD 99
-#endif
-
-#ifndef KARATSUBA_SQR_THRESHOLD
-#define KARATSUBA_SQR_THRESHOLD 22
-#endif
-#ifndef TOOM3_SQR_THRESHOLD
-#define TOOM3_SQR_THRESHOLD 89
-#endif
-
-#ifndef BZ_THRESHOLD
-#define BZ_THRESHOLD 40
-#endif
-
-#ifndef FIB_THRESHOLD
-#define FIB_THRESHOLD 98
-#endif
-
-#ifndef POWM_THRESHOLD
-#define POWM_THRESHOLD 13
-#endif
-
-#ifndef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD 5
-#endif
-#ifndef GCDEXT_THRESHOLD
-#define GCDEXT_THRESHOLD 25
-#endif
-
-#ifndef FFT_MUL_TABLE
-#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_MUL_THRESHOLD
-#define FFT_MODF_MUL_THRESHOLD 512
-#endif
-#ifndef FFT_MUL_THRESHOLD
-#define FFT_MUL_THRESHOLD 3840
-#endif
-
-#ifndef FFT_SQR_TABLE
-#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 }
-#endif
-#ifndef FFT_MODF_SQR_THRESHOLD
-#define FFT_MODF_SQR_THRESHOLD 512
-#endif
-#ifndef FFT_SQR_THRESHOLD
-#define FFT_SQR_THRESHOLD 4352
-#endif
diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
deleted file mode 100644
index 2225438658..0000000000
--- a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
+++ /dev/null
@@ -1,455 +0,0 @@
-dnl Intel P5 mpn_lshift -- mpn left shift.
-dnl
-dnl P5: 1.75 cycles/limb.
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C Shift src,size left by shift many bits and store the result in dst,size.
-C Zeros are shifted in at the right. Return the bits shifted out at the
-C left.
-C
-C The comments in mpn_rshift apply here too.
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-dnl minimum 5, because the unrolled loop can't handle less
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(8)
-
-PROLOGUE(mpn_lshift)
-
- pushl %ebx
- pushl %edi
-deflit(`FRAME',8)
-
- movl PARAM_SIZE, %eax
- movl PARAM_DST, %edx
-
- movl PARAM_SRC, %ebx
- movl PARAM_SHIFT, %ecx
-
- cmp $UNROLL_THRESHOLD, %eax
- jae L(unroll)
-
- movl -4(%ebx,%eax,4), %edi C src high limb
- decl %eax
-
- jnz L(simple)
-
- shldl( %cl, %edi, %eax) C eax was decremented to zero
-
- shll %cl, %edi
-
- movl %edi, (%edx) C dst low limb
- popl %edi C risk of data cache bank clash
-
- popl %ebx
-
- ret
-
-
-C -----------------------------------------------------------------------------
-L(simple):
- C eax size-1
- C ebx src
- C ecx shift
- C edx dst
- C esi
- C edi
- C ebp
-deflit(`FRAME',8)
-
- movd (%ebx,%eax,4), %mm5 C src high limb
-
- movd %ecx, %mm6 C lshift
- negl %ecx
-
- psllq %mm6, %mm5
- addl $32, %ecx
-
- movd %ecx, %mm7
- psrlq $32, %mm5 C retval
-
-
-L(simple_top):
- C eax counter, limbs, negative
- C ebx src
- C ecx
- C edx dst
- C esi
- C edi
- C
- C mm0 scratch
- C mm5 return value
- C mm6 shift
- C mm7 32-shift
-
- movq -4(%ebx,%eax,4), %mm0
- decl %eax
-
- psrlq %mm7, %mm0
-
- C
-
- movd %mm0, 4(%edx,%eax,4)
- jnz L(simple_top)
-
-
- movd (%ebx), %mm0
-
- movd %mm5, %eax
- psllq %mm6, %mm0
-
- popl %edi
- popl %ebx
-
- movd %mm0, (%edx)
-
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(unroll):
- C eax size
- C ebx src
- C ecx shift
- C edx dst
- C esi
- C edi
- C ebp
-deflit(`FRAME',8)
-
- movd -4(%ebx,%eax,4), %mm5 C src high limb
- leal (%ebx,%eax,4), %edi
-
- movd %ecx, %mm6 C lshift
- andl $4, %edi
-
- psllq %mm6, %mm5
- jz L(start_src_aligned)
-
-
- C src isn't aligned, process high limb separately (marked xxx) to
- C make it so.
- C
- C source -8(ebx,%eax,4)
- C |
- C +-------+-------+-------+--
- C | |
- C +-------+-------+-------+--
- C 0mod8 4mod8 0mod8
- C
- C dest
- C -4(edx,%eax,4)
- C |
- C +-------+-------+--
- C | xxx | |
- C +-------+-------+--
-
- movq -8(%ebx,%eax,4), %mm0 C unaligned load
-
- psllq %mm6, %mm0
- decl %eax
-
- psrlq $32, %mm0
-
- C
-
- movd %mm0, (%edx,%eax,4)
-L(start_src_aligned):
-
- movq -8(%ebx,%eax,4), %mm1 C src high qword
- leal (%edx,%eax,4), %edi
-
- andl $4, %edi
- psrlq $32, %mm5 C return value
-
- movq -16(%ebx,%eax,4), %mm3 C src second highest qword
- jz L(start_dst_aligned)
-
- C dst isn't aligned, subtract 4 to make it so, and pretend the shift
- C is 32 bits extra. High limb of dst (marked xxx) handled here
- C separately.
- C
- C source -8(ebx,%eax,4)
- C |
- C +-------+-------+--
- C | mm1 |
- C +-------+-------+--
- C 0mod8 4mod8
- C
- C dest
- C -4(edx,%eax,4)
- C |
- C +-------+-------+-------+--
- C | xxx | |
- C +-------+-------+-------+--
- C 0mod8 4mod8 0mod8
-
- movq %mm1, %mm0
- addl $32, %ecx C new shift
-
- psllq %mm6, %mm0
-
- movd %ecx, %mm6
- psrlq $32, %mm0
-
- C wasted cycle here waiting for %mm0
-
- movd %mm0, -4(%edx,%eax,4)
- subl $4, %edx
-L(start_dst_aligned):
-
-
- psllq %mm6, %mm1
- negl %ecx C -shift
-
- addl $64, %ecx C 64-shift
- movq %mm3, %mm2
-
- movd %ecx, %mm7
- subl $8, %eax C size-8
-
- psrlq %mm7, %mm3
-
- por %mm1, %mm3 C mm3 ready to store
- jc L(finish)
-
-
- C The comments in mpn_rshift apply here too.
-
- ALIGN(8)
-L(unroll_loop):
- C eax counter, limbs
- C ebx src
- C ecx
- C edx dst
- C esi
- C edi
- C
- C mm0
- C mm1
- C mm2 src qword from 48(%ebx,%eax,4)
- C mm3 dst qword ready to store to 56(%edx,%eax,4)
- C
- C mm5 return value
- C mm6 lshift
- C mm7 rshift
-
- movq 8(%ebx,%eax,4), %mm0
- psllq %mm6, %mm2
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- movq %mm3, 24(%edx,%eax,4) C prev
- por %mm2, %mm0
-
- movq (%ebx,%eax,4), %mm3 C
- psllq %mm6, %mm1 C
-
- movq %mm0, 16(%edx,%eax,4)
- movq %mm3, %mm2 C
-
- psrlq %mm7, %mm3 C
- subl $4, %eax
-
- por %mm1, %mm3 C
- jnc L(unroll_loop)
-
-
-
-L(finish):
- C eax -4 to -1 representing respectively 0 to 3 limbs remaining
-
- testb $2, %al
-
- jz L(finish_no_two)
-
- movq 8(%ebx,%eax,4), %mm0
- psllq %mm6, %mm2
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- movq %mm3, 24(%edx,%eax,4) C prev
- por %mm2, %mm0
-
- movq %mm1, %mm2
- movq %mm0, %mm3
-
- subl $2, %eax
-L(finish_no_two):
-
-
- C eax -4 or -3 representing respectively 0 or 1 limbs remaining
- C
- C mm2 src prev qword, from 48(%ebx,%eax,4)
- C mm3 dst qword, for 56(%edx,%eax,4)
-
- testb $1, %al
- movd %mm5, %eax C retval
-
- popl %edi
- jz L(finish_zero)
-
-
- C One extra src limb, destination was aligned.
- C
- C source ebx
- C --+---------------+-------+
- C | mm2 | |
- C --+---------------+-------+
- C
- C dest edx+12 edx+4 edx
- C --+---------------+---------------+-------+
- C | mm3 | | |
- C --+---------------+---------------+-------+
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C One extra src limb, destination was unaligned.
- C
- C source ebx
- C --+---------------+-------+
- C | mm2 | |
- C --+---------------+-------+
- C
- C dest edx+12 edx+4
- C --+---------------+---------------+
- C | mm3 | |
- C --+---------------+---------------+
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C In both cases there's one extra limb of src to fetch and combine
- C with mm2 to make a qword at 4(%edx), and in the aligned case
- C there's an extra limb of dst to be formed from that extra src limb
- C left shifted.
-
-
- movd (%ebx), %mm0
- psllq %mm6, %mm2
-
- movq %mm3, 12(%edx)
- psllq $32, %mm0
-
- movq %mm0, %mm1
- psrlq %mm7, %mm0
-
- por %mm2, %mm0
- psllq %mm6, %mm1
-
- movq %mm0, 4(%edx)
- psrlq $32, %mm1
-
- andl $32, %ecx
- popl %ebx
-
- jz L(finish_one_unaligned)
-
- movd %mm1, (%edx)
-L(finish_one_unaligned):
-
- emms
-
- ret
-
-
-L(finish_zero):
-
- C No extra src limbs, destination was aligned.
- C
- C source ebx
- C --+---------------+
- C | mm2 |
- C --+---------------+
- C
- C dest edx+8 edx
- C --+---------------+---------------+
- C | mm3 | |
- C --+---------------+---------------+
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C No extra src limbs, destination was unaligned.
- C
- C source ebx
- C --+---------------+
- C | mm2 |
- C --+---------------+
- C
- C dest edx+8 edx+4
- C --+---------------+-------+
- C | mm3 | |
- C --+---------------+-------+
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C The movd for the unaligned case writes the same data to 4(%edx)
- C that the movq does for the aligned case.
-
-
- movq %mm3, 8(%edx)
- andl $32, %ecx
-
- psllq %mm6, %mm2
- jz L(finish_zero_unaligned)
-
- movq %mm2, (%edx)
-L(finish_zero_unaligned):
-
- psrlq $32, %mm2
- popl %ebx
-
- movd %mm5, %eax C retval
-
- movd %mm2, 4(%edx)
-
- emms
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
deleted file mode 100644
index 587a07ab3d..0000000000
--- a/rts/gmp/mpn/x86/pentium/mmx/popham.asm
+++ /dev/null
@@ -1,30 +0,0 @@
-dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming
-dnl distance.
-dnl
-dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
-include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
deleted file mode 100644
index 7672630d57..0000000000
--- a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
+++ /dev/null
@@ -1,460 +0,0 @@
-dnl Intel P5 mpn_rshift -- mpn right shift.
-dnl
-dnl P5: 1.75 cycles/limb.
-
-
-dnl Copyright (C) 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C Shift src,size right by shift many bits and store the result in dst,size.
-C Zeros are shifted in at the left. Return the bits shifted out at the
-C right.
-C
-C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
-C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
-C
-C Full speed depends on source and destination being aligned. Unaligned mmx
-C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
-C setups and finish-ups are done to ensure alignment for the loop.
-C
-C MMX shifts work out a bit faster even for the simple loop.
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-dnl Minimum 5, because the unrolled loop can't handle less.
-deflit(UNROLL_THRESHOLD, 5)
-
- .text
- ALIGN(8)
-
-PROLOGUE(mpn_rshift)
-
- pushl %ebx
- pushl %edi
-deflit(`FRAME',8)
-
- movl PARAM_SIZE, %eax
- movl PARAM_DST, %edx
-
- movl PARAM_SRC, %ebx
- movl PARAM_SHIFT, %ecx
-
- cmp $UNROLL_THRESHOLD, %eax
- jae L(unroll)
-
- decl %eax
- movl (%ebx), %edi C src low limb
-
- jnz L(simple)
-
- shrdl( %cl, %edi, %eax) C eax was decremented to zero
-
- shrl %cl, %edi
-
- movl %edi, (%edx) C dst low limb
- popl %edi C risk of data cache bank clash
-
- popl %ebx
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(simple):
- C eax size-1
- C ebx src
- C ecx shift
- C edx dst
- C esi
- C edi
- C ebp
-deflit(`FRAME',8)
-
- movd (%ebx), %mm5 C src[0]
- leal (%ebx,%eax,4), %ebx C &src[size-1]
-
- movd %ecx, %mm6 C rshift
- leal -4(%edx,%eax,4), %edx C &dst[size-2]
-
- psllq $32, %mm5
- negl %eax
-
-
-C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
-C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
-C cycles and would be 8 in a simple loop. Using mmx helps the return value
-C and last limb calculations too.
-
-L(simple_top):
- C eax counter, limbs, negative
- C ebx &src[size-1]
- C ecx return value
- C edx &dst[size-2]
- C
- C mm0 scratch
- C mm5 return value
- C mm6 shift
-
- movq (%ebx,%eax,4), %mm0
- incl %eax
-
- psrlq %mm6, %mm0
-
- movd %mm0, (%edx,%eax,4)
- jnz L(simple_top)
-
-
- movd (%ebx), %mm0
- psrlq %mm6, %mm5 C return value
-
- psrlq %mm6, %mm0
- popl %edi
-
- movd %mm5, %eax
- popl %ebx
-
- movd %mm0, 4(%edx)
-
- emms
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(unroll):
- C eax size
- C ebx src
- C ecx shift
- C edx dst
- C esi
- C edi
- C ebp
-deflit(`FRAME',8)
-
- movd (%ebx), %mm5 C src[0]
- movl $4, %edi
-
- movd %ecx, %mm6 C rshift
- testl %edi, %ebx
-
- psllq $32, %mm5
- jz L(start_src_aligned)
-
-
- C src isn't aligned, process low limb separately (marked xxx) and
- C step src and dst by one limb, making src aligned.
- C
- C source ebx
- C --+-------+-------+-------+
- C | xxx |
- C --+-------+-------+-------+
- C 4mod8 0mod8 4mod8
- C
- C dest edx
- C --+-------+-------+
- C | | xxx |
- C --+-------+-------+
-
- movq (%ebx), %mm0 C unaligned load
-
- psrlq %mm6, %mm0
- addl $4, %ebx
-
- decl %eax
-
- movd %mm0, (%edx)
- addl $4, %edx
-L(start_src_aligned):
-
-
- movq (%ebx), %mm1
- testl %edi, %edx
-
- psrlq %mm6, %mm5 C retval
- jz L(start_dst_aligned)
-
- C dst isn't aligned, add 4 to make it so, and pretend the shift is
- C 32 bits extra. Low limb of dst (marked xxx) handled here
- C separately.
- C
- C source ebx
- C --+-------+-------+
- C | mm1 |
- C --+-------+-------+
- C 4mod8 0mod8
- C
- C dest edx
- C --+-------+-------+-------+
- C | xxx |
- C --+-------+-------+-------+
- C 4mod8 0mod8 4mod8
-
- movq %mm1, %mm0
- addl $32, %ecx C new shift
-
- psrlq %mm6, %mm0
-
- movd %ecx, %mm6
-
- movd %mm0, (%edx)
- addl $4, %edx
-L(start_dst_aligned):
-
-
- movq 8(%ebx), %mm3
- negl %ecx
-
- movq %mm3, %mm2 C mm2 src qword
- addl $64, %ecx
-
- movd %ecx, %mm7
- psrlq %mm6, %mm1
-
- leal -12(%ebx,%eax,4), %ebx
- leal -20(%edx,%eax,4), %edx
-
- psllq %mm7, %mm3
- subl $7, %eax C size-7
-
- por %mm1, %mm3 C mm3 ready to store
- negl %eax C -(size-7)
-
- jns L(finish)
-
-
- C This loop is the important bit, the rest is just support. Careful
- C instruction scheduling achieves the claimed 1.75 c/l. The
- C relevant parts of the pairing rules are:
- C
- C - mmx loads and stores execute only in the U pipe
- C - only one mmx shift in a pair
- C - wait one cycle before storing an mmx register result
- C - the usual address generation interlock
- C
- C Two qword calculations are slightly interleaved. The instructions
- C marked "C" belong to the second qword, and the "C prev" one is for
- C the second qword from the previous iteration.
-
- ALIGN(8)
-L(unroll_loop):
- C eax counter, limbs, negative
- C ebx &src[size-12]
- C ecx
- C edx &dst[size-12]
- C esi
- C edi
- C
- C mm0
- C mm1
- C mm2 src qword from -8(%ebx,%eax,4)
- C mm3 dst qword ready to store to -8(%edx,%eax,4)
- C
- C mm5 return value
- C mm6 rshift
- C mm7 lshift
-
- movq (%ebx,%eax,4), %mm0
- psrlq %mm6, %mm2
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- movq %mm3, -8(%edx,%eax,4) C prev
- por %mm2, %mm0
-
- movq 8(%ebx,%eax,4), %mm3 C
- psrlq %mm6, %mm1 C
-
- movq %mm0, (%edx,%eax,4)
- movq %mm3, %mm2 C
-
- psllq %mm7, %mm3 C
- addl $4, %eax
-
- por %mm1, %mm3 C
- js L(unroll_loop)
-
-
-L(finish):
- C eax 0 to 3 representing respectively 3 to 0 limbs remaining
-
- testb $2, %al
-
- jnz L(finish_no_two)
-
- movq (%ebx,%eax,4), %mm0
- psrlq %mm6, %mm2
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- movq %mm3, -8(%edx,%eax,4) C prev
- por %mm2, %mm0
-
- movq %mm1, %mm2
- movq %mm0, %mm3
-
- addl $2, %eax
-L(finish_no_two):
-
-
- C eax 2 or 3 representing respectively 1 or 0 limbs remaining
- C
- C mm2 src prev qword, from -8(%ebx,%eax,4)
- C mm3 dst qword, for -8(%edx,%eax,4)
-
- testb $1, %al
- popl %edi
-
- movd %mm5, %eax C retval
- jnz L(finish_zero)
-
-
- C One extra limb, destination was aligned.
- C
- C source ebx
- C +-------+---------------+--
- C | | mm2 |
- C +-------+---------------+--
- C
- C dest edx
- C +-------+---------------+---------------+--
- C | | | mm3 |
- C +-------+---------------+---------------+--
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C One extra limb, destination was unaligned.
- C
- C source ebx
- C +-------+---------------+--
- C | | mm2 |
- C +-------+---------------+--
- C
- C dest edx
- C +---------------+---------------+--
- C | | mm3 |
- C +---------------+---------------+--
- C
- C mm6 = shift+32
- C mm7 = ecx = 64-(shift+32)
-
-
- C In both cases there's one extra limb of src to fetch and combine
- C with mm2 to make a qword at 8(%edx), and in the aligned case
- C there's a further extra limb of dst to be formed.
-
-
- movd 8(%ebx), %mm0
- psrlq %mm6, %mm2
-
- movq %mm0, %mm1
- psllq %mm7, %mm0
-
- movq %mm3, (%edx)
- por %mm2, %mm0
-
- psrlq %mm6, %mm1
- andl $32, %ecx
-
- popl %ebx
- jz L(finish_one_unaligned)
-
- C dst was aligned, must store one extra limb
- movd %mm1, 16(%edx)
-L(finish_one_unaligned):
-
- movq %mm0, 8(%edx)
-
- emms
-
- ret
-
-
-L(finish_zero):
-
- C No extra limbs, destination was aligned.
- C
- C source ebx
- C +---------------+--
- C | mm2 |
- C +---------------+--
- C
- C dest edx+4
- C +---------------+---------------+--
- C | | mm3 |
- C +---------------+---------------+--
- C
- C mm6 = shift
- C mm7 = ecx = 64-shift
-
-
- C No extra limbs, destination was unaligned.
- C
- C source ebx
- C +---------------+--
- C | mm2 |
- C +---------------+--
- C
- C dest edx+4
- C +-------+---------------+--
- C | | mm3 |
- C +-------+---------------+--
- C
- C mm6 = shift+32
- C mm7 = 64-(shift+32)
-
-
- C The movd for the unaligned case is clearly the same data as the
- C movq for the aligned case, it's just a choice between whether one
- C or two limbs should be written.
-
-
- movq %mm3, 4(%edx)
- psrlq %mm6, %mm2
-
- movd %mm2, 12(%edx)
- andl $32, %ecx
-
- popl %ebx
- jz L(finish_zero_unaligned)
-
- movq %mm2, 12(%edx)
-L(finish_zero_unaligned):
-
- emms
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm
deleted file mode 100644
index 08639eca09..0000000000
--- a/rts/gmp/mpn/x86/pentium/mul_1.asm
+++ /dev/null
@@ -1,79 +0,0 @@
-dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
-dnl
-dnl P5: 13.0 cycles/limb
-
-dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA. */
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t multiplier);
-
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_mul_1)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST, %edi
- movl PARAM_SRC, %esi
- movl PARAM_SIZE, %ecx
- movl PARAM_MULTIPLIER, %ebp
-
- leal (%edi,%ecx,4), %edi
- leal (%esi,%ecx,4), %esi
- negl %ecx
- xorl %ebx, %ebx
- ALIGN(8)
-
-L(oop): adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
-
- mull %ebp
-
- addl %eax, %ebx
-
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
-
- movl %edx, %ebx
- jnz L(oop)
-
- adcl $0, %ebx
- movl %ebx, %eax
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
deleted file mode 100644
index d9f79a0831..0000000000
--- a/rts/gmp/mpn/x86/pentium/mul_basecase.asm
+++ /dev/null
@@ -1,135 +0,0 @@
-dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
-dnl
-dnl P5: 14.2 cycles/crossproduct (approx)
-
-
-dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_mul_basecase (mp_ptr wp,
-C mp_srcptr xp, mp_size_t xsize,
-C mp_srcptr yp, mp_size_t ysize);
-
-defframe(PARAM_YSIZE, 20)
-defframe(PARAM_YP, 16)
-defframe(PARAM_XSIZE, 12)
-defframe(PARAM_XP, 8)
-defframe(PARAM_WP, 4)
-
-defframe(VAR_COUNTER, -4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_mul_basecase)
-
- pushl %eax C dummy push for allocating stack slot
- pushl %esi
- pushl %ebp
- pushl %edi
-deflit(`FRAME',16)
-
- movl PARAM_XP,%esi
- movl PARAM_WP,%edi
- movl PARAM_YP,%ebp
-
- movl (%esi),%eax C load xp[0]
- mull (%ebp) C multiply by yp[0]
- movl %eax,(%edi) C store to wp[0]
- movl PARAM_XSIZE,%ecx C xsize
- decl %ecx C If xsize = 1, ysize = 1 too
- jz L(done)
-
- movl PARAM_XSIZE,%eax
- pushl %ebx
-FRAME_pushl()
- movl %edx,%ebx
- leal (%esi,%eax,4),%esi C make xp point at end
- leal (%edi,%eax,4),%edi C offset wp by xsize
- negl %ecx C negate j size/index for inner loop
- xorl %eax,%eax C clear carry
-
- ALIGN(8)
-L(oop1): adcl $0,%ebx
- movl (%esi,%ecx,4),%eax C load next limb at xp[j]
- mull (%ebp)
- addl %ebx,%eax
- movl %eax,(%edi,%ecx,4)
- incl %ecx
- movl %edx,%ebx
- jnz L(oop1)
-
- adcl $0,%ebx
- movl PARAM_YSIZE,%eax
- movl %ebx,(%edi) C most significant limb of product
- addl $4,%edi C increment wp
- decl %eax
- jz L(skip)
- movl %eax,VAR_COUNTER C set index i to ysize
-
-L(outer):
- addl $4,%ebp C make ebp point to next y limb
- movl PARAM_XSIZE,%ecx
- negl %ecx
- xorl %ebx,%ebx
-
- C code at 0x61 here, close enough to aligned
-L(oop2):
- adcl $0,%ebx
- movl (%esi,%ecx,4),%eax
- mull (%ebp)
- addl %ebx,%eax
- movl (%edi,%ecx,4),%ebx
- adcl $0,%edx
- addl %eax,%ebx
- movl %ebx,(%edi,%ecx,4)
- incl %ecx
- movl %edx,%ebx
- jnz L(oop2)
-
- adcl $0,%ebx
-
- movl %ebx,(%edi)
- addl $4,%edi
- movl VAR_COUNTER,%eax
- decl %eax
- movl %eax,VAR_COUNTER
- jnz L(outer)
-
-L(skip):
- popl %ebx
- popl %edi
- popl %ebp
- popl %esi
- addl $4,%esp
- ret
-
-L(done):
- movl %edx,4(%edi) C store to wp[1]
- popl %edi
- popl %ebp
- popl %esi
- popl %eax C dummy pop for deallocating stack slot
- ret
-
-EPILOGUE()
-
diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm
deleted file mode 100644
index e8f5ae8ec8..0000000000
--- a/rts/gmp/mpn/x86/pentium/rshift.asm
+++ /dev/null
@@ -1,236 +0,0 @@
-dnl Intel Pentium mpn_rshift -- mpn right shift.
-dnl
-dnl cycles/limb
-dnl P5,P54: 6.0
-dnl P55: 5.375
-
-
-dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
-dnl Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-C
-C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
-C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_rshift)
-
- pushl %edi
- pushl %esi
- pushl %ebx
- pushl %ebp
-deflit(`FRAME',16)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%ebp
- movl PARAM_SHIFT,%ecx
-
-C We can use faster code for shift-by-1 under certain conditions.
- cmp $1,%ecx
- jne L(normal)
- leal 4(%edi),%eax
- cmpl %esi,%eax
- jnc L(special) C jump if res_ptr + 1 >= s_ptr
- leal (%edi,%ebp,4),%eax
- cmpl %eax,%esi
- jnc L(special) C jump if s_ptr >= res_ptr + size
-
-L(normal):
- movl (%esi),%edx
- addl $4,%esi
- xorl %eax,%eax
- shrdl( %cl, %edx, %eax) C compute carry limb
- pushl %eax C push carry limb onto stack
-
- decl %ebp
- pushl %ebp
- shrl $3,%ebp
- jz L(end)
-
- movl (%edi),%eax C fetch destination cache line
-
- ALIGN(4)
-L(oop): movl 28(%edi),%eax C fetch destination cache line
- movl %edx,%ebx
-
- movl (%esi),%eax
- movl 4(%esi),%edx
- shrdl( %cl, %eax, %ebx)
- shrdl( %cl, %edx, %eax)
- movl %ebx,(%edi)
- movl %eax,4(%edi)
-
- movl 8(%esi),%ebx
- movl 12(%esi),%eax
- shrdl( %cl, %ebx, %edx)
- shrdl( %cl, %eax, %ebx)
- movl %edx,8(%edi)
- movl %ebx,12(%edi)
-
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- shrdl( %cl, %edx, %eax)
- shrdl( %cl, %ebx, %edx)
- movl %eax,16(%edi)
- movl %edx,20(%edi)
-
- movl 24(%esi),%eax
- movl 28(%esi),%edx
- shrdl( %cl, %eax, %ebx)
- shrdl( %cl, %edx, %eax)
- movl %ebx,24(%edi)
- movl %eax,28(%edi)
-
- addl $32,%esi
- addl $32,%edi
- decl %ebp
- jnz L(oop)
-
-L(end): popl %ebp
- andl $7,%ebp
- jz L(end2)
-L(oop2):
- movl (%esi),%eax
- shrdl( %cl,%eax,%edx) C compute result limb
- movl %edx,(%edi)
- movl %eax,%edx
- addl $4,%esi
- addl $4,%edi
- decl %ebp
- jnz L(oop2)
-
-L(end2):
- shrl %cl,%edx C compute most significant limb
- movl %edx,(%edi) C store it
-
- popl %eax C pop carry limb
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-
-C We loop from least significant end of the arrays, which is only
-C permissable if the source and destination don't overlap, since the
-C function is documented to work for overlapping source and destination.
-
-L(special):
- leal -4(%edi,%ebp,4),%edi
- leal -4(%esi,%ebp,4),%esi
-
- movl (%esi),%edx
- subl $4,%esi
-
- decl %ebp
- pushl %ebp
- shrl $3,%ebp
-
- shrl %edx
- incl %ebp
- decl %ebp
- jz L(Lend)
-
- movl (%edi),%eax C fetch destination cache line
-
- ALIGN(4)
-L(Loop):
- movl -28(%edi),%eax C fetch destination cache line
- movl %edx,%ebx
-
- movl (%esi),%eax
- movl -4(%esi),%edx
- rcrl %eax
- movl %ebx,(%edi)
- rcrl %edx
- movl %eax,-4(%edi)
-
- movl -8(%esi),%ebx
- movl -12(%esi),%eax
- rcrl %ebx
- movl %edx,-8(%edi)
- rcrl %eax
- movl %ebx,-12(%edi)
-
- movl -16(%esi),%edx
- movl -20(%esi),%ebx
- rcrl %edx
- movl %eax,-16(%edi)
- rcrl %ebx
- movl %edx,-20(%edi)
-
- movl -24(%esi),%eax
- movl -28(%esi),%edx
- rcrl %eax
- movl %ebx,-24(%edi)
- rcrl %edx
- movl %eax,-28(%edi)
-
- leal -32(%esi),%esi C use leal not to clobber carry
- leal -32(%edi),%edi
- decl %ebp
- jnz L(Loop)
-
-L(Lend):
- popl %ebp
- sbbl %eax,%eax C save carry in %eax
- andl $7,%ebp
- jz L(Lend2)
- addl %eax,%eax C restore carry from eax
-L(Loop2):
- movl %edx,%ebx
- movl (%esi),%edx
- rcrl %edx
- movl %ebx,(%edi)
-
- leal -4(%esi),%esi C use leal not to clobber carry
- leal -4(%edi),%edi
- decl %ebp
- jnz L(Loop2)
-
- jmp L(L1)
-L(Lend2):
- addl %eax,%eax C restore carry from eax
-L(L1): movl %edx,(%edi) C store last limb
-
- movl $0,%eax
- rcrl %eax
-
- popl %ebp
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
deleted file mode 100644
index c8584df13c..0000000000
--- a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
+++ /dev/null
@@ -1,520 +0,0 @@
-dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
-dnl
-dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
-dnl product at around 20x20 limbs.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
-C
-C Calculate src,size squared, storing the result in dst,2*size.
-C
-C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
-C lot of function call overheads are avoided, especially when the size is
-C small.
-
-defframe(PARAM_SIZE,12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_sqr_basecase)
-deflit(`FRAME',0)
-
- movl PARAM_SIZE, %edx
- movl PARAM_SRC, %eax
-
- cmpl $2, %edx
- movl PARAM_DST, %ecx
-
- je L(two_limbs)
-
- movl (%eax), %eax
- ja L(three_or_more)
-
-C -----------------------------------------------------------------------------
-C one limb only
- C eax src
- C ebx
- C ecx dst
- C edx
-
- mull %eax
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
-
- ret
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(two_limbs):
- C eax src
- C ebx
- C ecx dst
- C edx size
-
- pushl %ebp
- pushl %edi
-
- pushl %esi
- pushl %ebx
-
- movl %eax, %ebx
- movl (%eax), %eax
-
- mull %eax C src[0]^2
-
- movl %eax, (%ecx) C dst[0]
- movl %edx, %esi C dst[1]
-
- movl 4(%ebx), %eax
-
- mull %eax C src[1]^2
-
- movl %eax, %edi C dst[2]
- movl %edx, %ebp C dst[3]
-
- movl (%ebx), %eax
-
- mull 4(%ebx) C src[0]*src[1]
-
- addl %eax, %esi
- popl %ebx
-
- adcl %edx, %edi
-
- adcl $0, %ebp
- addl %esi, %eax
-
- adcl %edi, %edx
- movl %eax, 4(%ecx)
-
- adcl $0, %ebp
- popl %esi
-
- movl %edx, 8(%ecx)
- movl %ebp, 12(%ecx)
-
- popl %edi
- popl %ebp
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(three_or_more):
- C eax src low limb
- C ebx
- C ecx dst
- C edx size
-
- cmpl $4, %edx
- pushl %ebx
-deflit(`FRAME',4)
-
- movl PARAM_SRC, %ebx
- jae L(four_or_more)
-
-
-C -----------------------------------------------------------------------------
-C three limbs
- C eax src low limb
- C ebx src
- C ecx dst
- C edx size
-
- pushl %ebp
- pushl %edi
-
- mull %eax C src[0] ^ 2
-
- movl %eax, (%ecx)
- movl %edx, 4(%ecx)
-
- movl 4(%ebx), %eax
- xorl %ebp, %ebp
-
- mull %eax C src[1] ^ 2
-
- movl %eax, 8(%ecx)
- movl %edx, 12(%ecx)
-
- movl 8(%ebx), %eax
- pushl %esi C risk of cache bank clash
-
- mull %eax C src[2] ^ 2
-
- movl %eax, 16(%ecx)
- movl %edx, 20(%ecx)
-
- movl (%ebx), %eax
-
- mull 4(%ebx) C src[0] * src[1]
-
- movl %eax, %esi
- movl %edx, %edi
-
- movl (%ebx), %eax
-
- mull 8(%ebx) C src[0] * src[2]
-
- addl %eax, %edi
- movl %edx, %ebp
-
- adcl $0, %ebp
- movl 4(%ebx), %eax
-
- mull 8(%ebx) C src[1] * src[2]
-
- xorl %ebx, %ebx
- addl %eax, %ebp
-
- C eax
- C ebx zero, will be dst[5]
- C ecx dst
- C edx dst[4]
- C esi dst[1]
- C edi dst[2]
- C ebp dst[3]
-
- adcl $0, %edx
- addl %esi, %esi
-
- adcl %edi, %edi
-
- adcl %ebp, %ebp
-
- adcl %edx, %edx
- movl 4(%ecx), %eax
-
- adcl $0, %ebx
- addl %esi, %eax
-
- movl %eax, 4(%ecx)
- movl 8(%ecx), %eax
-
- adcl %edi, %eax
- movl 12(%ecx), %esi
-
- adcl %ebp, %esi
- movl 16(%ecx), %edi
-
- movl %eax, 8(%ecx)
- movl %esi, 12(%ecx)
-
- adcl %edx, %edi
- popl %esi
-
- movl 20(%ecx), %eax
- movl %edi, 16(%ecx)
-
- popl %edi
- popl %ebp
-
- adcl %ebx, %eax C no carry out of this
- popl %ebx
-
- movl %eax, 20(%ecx)
-
- ret
-
-
-C -----------------------------------------------------------------------------
- ALIGN(8)
-L(four_or_more):
- C eax src low limb
- C ebx src
- C ecx dst
- C edx size
- C esi
- C edi
- C ebp
- C
- C First multiply src[0]*src[1..size-1] and store at dst[1..size].
-
-deflit(`FRAME',4)
-
- pushl %edi
-FRAME_pushl()
- pushl %esi
-FRAME_pushl()
-
- pushl %ebp
-FRAME_pushl()
- leal (%ecx,%edx,4), %edi C dst end of this mul1
-
- leal (%ebx,%edx,4), %esi C src end
- movl %ebx, %ebp C src
-
- negl %edx C -size
- xorl %ebx, %ebx C clear carry limb and carry flag
-
- leal 1(%edx), %ecx C -(size-1)
-
-L(mul1):
- C eax scratch
- C ebx carry
- C ecx counter, negative
- C edx scratch
- C esi &src[size]
- C edi &dst[size]
- C ebp src
-
- adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
-
- mull (%ebp)
-
- addl %eax, %ebx
-
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
-
- movl %edx, %ebx
- jnz L(mul1)
-
-
- C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
- C n=1..size-2.
- C
- C The last two products, which are the end corner of the product
- C triangle, are handled separately to save looping overhead. These
- C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
- C If size is 4 then it's only these that need to be done.
- C
- C In the outer loop %esi is a constant, and %edi just advances by 1
- C limb each time. The size of the operation decreases by 1 limb
- C each time.
-
- C eax
- C ebx carry (needing carry flag added)
- C ecx
- C edx
- C esi &src[size]
- C edi &dst[size]
- C ebp
-
- adcl $0, %ebx
- movl PARAM_SIZE, %edx
-
- movl %ebx, (%edi)
- subl $4, %edx
-
- negl %edx
- jz L(corner)
-
-
-L(outer):
- C ebx previous carry limb to store
- C edx outer loop counter (negative)
- C esi &src[size]
- C edi dst, pointing at stored carry limb of previous loop
-
- pushl %edx C new outer loop counter
- leal -2(%edx), %ecx
-
- movl %ebx, (%edi)
- addl $4, %edi
-
- addl $4, %ebp
- xorl %ebx, %ebx C initial carry limb, clear carry flag
-
-L(inner):
- C eax scratch
- C ebx carry (needing carry flag added)
- C ecx counter, negative
- C edx scratch
- C esi &src[size]
- C edi dst end of this addmul
- C ebp &src[j]
-
- adcl $0, %ebx
- movl (%esi,%ecx,4), %eax
-
- mull (%ebp)
-
- addl %ebx, %eax
- movl (%edi,%ecx,4), %ebx
-
- adcl $0, %edx
- addl %eax, %ebx
-
- movl %ebx, (%edi,%ecx,4)
- incl %ecx
-
- movl %edx, %ebx
- jnz L(inner)
-
-
- adcl $0, %ebx
- popl %edx C outer loop counter
-
- incl %edx
- jnz L(outer)
-
-
- movl %ebx, (%edi)
-
-L(corner):
- C esi &src[size]
- C edi &dst[2*size-4]
-
- movl -8(%esi), %eax
- movl -4(%edi), %ebx C risk of data cache bank clash here
-
- mull -12(%esi) C src[size-2]*src[size-3]
-
- addl %eax, %ebx
- movl %edx, %ecx
-
- adcl $0, %ecx
- movl -4(%esi), %eax
-
- mull -12(%esi) C src[size-1]*src[size-3]
-
- addl %ecx, %eax
- movl (%edi), %ecx
-
- adcl $0, %edx
- movl %ebx, -4(%edi)
-
- addl %eax, %ecx
- movl %edx, %ebx
-
- adcl $0, %ebx
- movl -4(%esi), %eax
-
- mull -8(%esi) C src[size-1]*src[size-2]
-
- movl %ecx, 0(%edi)
- addl %eax, %ebx
-
- adcl $0, %edx
- movl PARAM_SIZE, %eax
-
- negl %eax
- movl %ebx, 4(%edi)
-
- addl $1, %eax C -(size-1) and clear carry
- movl %edx, 8(%edi)
-
-
-C -----------------------------------------------------------------------------
-C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
-
-L(lshift):
- C eax counter, negative
- C ebx next limb
- C ecx
- C edx
- C esi
- C edi &dst[2*size-4]
- C ebp
-
- movl 12(%edi,%eax,8), %ebx
-
- rcll %ebx
- movl 16(%edi,%eax,8), %ecx
-
- rcll %ecx
- movl %ebx, 12(%edi,%eax,8)
-
- movl %ecx, 16(%edi,%eax,8)
- incl %eax
-
- jnz L(lshift)
-
-
- adcl %eax, %eax C high bit out
- movl PARAM_SRC, %esi
-
- movl PARAM_SIZE, %ecx C risk of cache bank clash
- movl %eax, 12(%edi) C dst most significant limb
-
-
-C -----------------------------------------------------------------------------
-C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
-C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
-C low limb of src[0]^2.
-
- movl (%esi), %eax C src[0]
- leal (%esi,%ecx,4), %esi C src end
-
- negl %ecx
-
- mull %eax
-
- movl %eax, 16(%edi,%ecx,8) C dst[0]
- movl %edx, %ebx
-
- addl $1, %ecx C size-1 and clear carry
-
-L(diag):
- C eax scratch (low product)
- C ebx carry limb
- C ecx counter, negative
- C edx scratch (high product)
- C esi &src[size]
- C edi &dst[2*size-4]
- C ebp scratch (fetched dst limbs)
-
- movl (%esi,%ecx,4), %eax
- adcl $0, %ebx
-
- mull %eax
-
- movl 16-4(%edi,%ecx,8), %ebp
-
- addl %ebp, %ebx
- movl 16(%edi,%ecx,8), %ebp
-
- adcl %eax, %ebp
- movl %ebx, 16-4(%edi,%ecx,8)
-
- movl %ebp, 16(%edi,%ecx,8)
- incl %ecx
-
- movl %edx, %ebx
- jnz L(diag)
-
-
- adcl $0, %edx
- movl 16-4(%edi), %eax C dst most significant limb
-
- addl %eax, %edx
- popl %ebp
-
- movl %edx, 16-4(%edi)
- popl %esi C risk of cache bank clash
-
- popl %edi
- popl %ebx
-
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm
deleted file mode 100644
index c9881fd966..0000000000
--- a/rts/gmp/mpn/x86/rshift.asm
+++ /dev/null
@@ -1,92 +0,0 @@
-dnl x86 mpn_rshift -- mpn right shift.
-
-dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
-dnl Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C unsigned shift);
-
-defframe(PARAM_SHIFT,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-
- .text
- ALIGN(8)
-PROLOGUE(mpn_rshift)
-
- pushl %edi
- pushl %esi
- pushl %ebx
-deflit(`FRAME',12)
-
- movl PARAM_DST,%edi
- movl PARAM_SRC,%esi
- movl PARAM_SIZE,%edx
- movl PARAM_SHIFT,%ecx
-
- leal -4(%edi,%edx,4),%edi
- leal (%esi,%edx,4),%esi
- negl %edx
-
- movl (%esi,%edx,4),%ebx C read least significant limb
- xorl %eax,%eax
- shrdl( %cl, %ebx, %eax) C compute carry limb
- incl %edx
- jz L(end)
- pushl %eax C push carry limb onto stack
- testb $1,%dl
- jnz L(1) C enter loop in the middle
- movl %ebx,%eax
-
- ALIGN(8)
-L(oop): movl (%esi,%edx,4),%ebx C load next higher limb
- shrdl( %cl, %ebx, %eax) C compute result limb
- movl %eax,(%edi,%edx,4) C store it
- incl %edx
-L(1): movl (%esi,%edx,4),%eax
- shrdl( %cl, %eax, %ebx)
- movl %ebx,(%edi,%edx,4)
- incl %edx
- jnz L(oop)
-
- shrl %cl,%eax C compute most significant limb
- movl %eax,(%edi) C store it
-
- popl %eax C pop carry limb
-
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-L(end): shrl %cl,%ebx C compute most significant limb
- movl %ebx,(%edi) C store it
-
- popl %ebx
- popl %esi
- popl %edi
- ret
-
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm
deleted file mode 100644
index 9fe022b107..0000000000
--- a/rts/gmp/mpn/x86/udiv.asm
+++ /dev/null
@@ -1,44 +0,0 @@
-dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
-C mp_limb_t divisor);
-
-defframe(PARAM_DIVISOR, 16)
-defframe(PARAM_LOW, 12)
-defframe(PARAM_HIGH, 8)
-defframe(PARAM_REMPTR, 4)
-
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_udiv_qrnnd)
-deflit(`FRAME',0)
- movl PARAM_LOW, %eax
- movl PARAM_HIGH, %edx
- divl PARAM_DIVISOR
- movl PARAM_REMPTR, %ecx
- movl %edx, (%ecx)
- ret
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm
deleted file mode 100644
index 3d289d1784..0000000000
--- a/rts/gmp/mpn/x86/umul.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-include(`../config.m4')
-
-
-C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
-C
-
-defframe(PARAM_M2, 12)
-defframe(PARAM_M1, 8)
-defframe(PARAM_LOWPTR, 4)
-
- TEXT
- ALIGN(8)
-PROLOGUE(mpn_umul_ppmm)
-deflit(`FRAME',0)
- movl PARAM_LOWPTR, %ecx
- movl PARAM_M1, %eax
- mull PARAM_M2
- movl %eax, (%ecx)
- movl %edx, %eax
- ret
-EPILOGUE()
diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4
deleted file mode 100644
index 2dad698002..0000000000
--- a/rts/gmp/mpn/x86/x86-defs.m4
+++ /dev/null
@@ -1,713 +0,0 @@
-divert(-1)
-
-dnl m4 macros for x86 assembler.
-
-
-dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 2.1 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with the GNU MP Library; see the file COPYING.LIB. If
-dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
-dnl Suite 330, Boston, MA 02111-1307, USA.
-
-
-dnl Notes:
-dnl
-dnl m4 isn't perfect for processing BSD style x86 assembler code, the main
-dnl problems are,
-dnl
-dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
-dnl foo(%ebx) expands as a macro rather than a constant. This is worked
-dnl around by using deflit() from asm-defs.m4, instead of define().
-dnl
-dnl 2. Immediates in macro definitions need a space or `' to stop the $
-dnl looking like a macro parameter. For example,
-dnl
-dnl define(foo, `mov $ 123, %eax')
-dnl
-dnl This is only a problem in macro definitions, not in ordinary text,
-dnl nor in macro parameters like text passed to forloop() or ifdef().
-
-
-deflit(BYTES_PER_MP_LIMB, 4)
-
-
-dnl --------------------------------------------------------------------------
-dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking.
-dnl Nesting and overlapping not allowed.
-dnl
-
-
-dnl Usage: PROLOGUE(functionname)
-dnl
-dnl Generate a function prologue. functionname gets GSYM_PREFIX added.
-dnl Examples,
-dnl
-dnl PROLOGUE(mpn_add_n)
-dnl PROLOGUE(somefun)
-
-define(`PROLOGUE',
-m4_assert_numargs(1)
-m4_assert_defined(`PROLOGUE_cpu')
-`ifdef(`PROLOGUE_current_function',
-`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1)
-)')dnl
-m4_file_seen()dnl
-define(`PROLOGUE_current_function',`$1')dnl
-PROLOGUE_cpu(GSYM_PREFIX`'$1)')
-
-
-dnl Usage: EPILOGUE()
-dnl
-dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1
-dnl instead of the long PROLOGUE_current_function symbol.
-
-define(`EPILOGUE',
-m4_assert_numargs(0)
-m4_assert_defined(`EPILOGUE_cpu')
-`ifdef(`PROLOGUE_current_function',,
-`m4_error(`EPILOGUE'() with no `PROLOGUE'()
-)')dnl
-EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl
-undefine(`PROLOGUE_current_function')')
-
-m4wrap_prepend(
-`ifdef(`PROLOGUE_current_function',
-`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen
-')')')
-
-
-dnl Usage: PROLOGUE_assert_inside()
-dnl
-dnl Use this unquoted on a line on its own at the start of a macro
-dnl definition to add some code to check the macro is only used inside a
-dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is
-dnl defined.
-
-define(PROLOGUE_assert_inside,
-m4_assert_numargs(0)
-``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '')
-
-define(PROLOGUE_assert_inside_internal,
-m4_assert_numargs(1)
-`ifdef(`PROLOGUE_current_function',,
-`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair
-')')')
-
-
-dnl Usage: L(labelname)
-dnl LF(functionname,labelname)
-dnl
-dnl Generate a local label in the current or given function. For LF(),
-dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE().
-dnl
-dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn),
-dnl
-dnl L(bar) => L__gmpn_add_n__bar
-dnl LF(somefun,bar) => Lsomefun__bar
-dnl
-dnl The funtion name and label name get two underscores between them rather
-dnl than one to guard against clashing with a separate external symbol that
-dnl happened to be called functionname_labelname. (Though this would only
-dnl happen if the local label prefix is is empty.) Underscores are used so
-dnl the whole label will still be a valid C identifier and so can be easily
-dnl used in gdb.
-
-dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the
-dnl L macro and making an infinite recursion.
-define(LF,
-m4_assert_numargs(2)
-m4_assert_defined(`LSYM_PREFIX')
-`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2')
-
-define(`L',
-m4_assert_numargs(1)
-PROLOGUE_assert_inside()
-`LF(PROLOGUE_current_function,`$1')')
-
-
-dnl Called: PROLOGUE_cpu(gsym)
-dnl EPILOGUE_cpu(gsym)
-
-define(PROLOGUE_cpu,
-m4_assert_numargs(1)
- `GLOBL $1
- TYPE($1,`function')
-$1:')
-
-define(EPILOGUE_cpu,
-m4_assert_numargs(1)
-` SIZE($1,.-$1)')
-
-
-
-dnl --------------------------------------------------------------------------
-dnl Various x86 macros.
-dnl
-
-
-dnl Usage: ALIGN_OFFSET(bytes,offset)
-dnl
-dnl Align to `offset' away from a multiple of `bytes'.
-dnl
-dnl This is useful for testing, for example align to something very strict
-dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
-dnl
-dnl Generally you wouldn't execute across the padding, but it's done with
-dnl nop's so it'll work.
-
-define(ALIGN_OFFSET,
-m4_assert_numargs(2)
-`ALIGN($1)
-forloop(`i',1,$2,` nop
-')')
-
-
-dnl Usage: defframe(name,offset)
-dnl
-dnl Make a definition like the following with which to access a parameter
-dnl or variable on the stack.
-dnl
-dnl define(name,`FRAME+offset(%esp)')
-dnl
-dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
-dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
-dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
-dnl zero offset is wanted.
-dnl
-dnl The new macro also gets a check that when it's used FRAME is actually
-dnl defined, and that the final %esp offset isn't negative, which would
-dnl mean an attempt to access something below the current %esp.
-dnl
-dnl deflit() is used rather than a plain define(), so the new macro won't
-dnl delete any following parenthesized expression. name(%edi) will come
-dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
-dnl provoke an error, which is better than silently giving just 16(%esp).
-dnl
-dnl See README.family for more on the suggested way to access the stack
-dnl frame.
-
-define(defframe,
-m4_assert_numargs(2)
-`deflit(`$1',
-m4_assert_defined(`FRAME')
-`defframe_check_notbelow(`$1',$2,FRAME)dnl
-defframe_empty_if_zero(FRAME+($2))(%esp)')')
-
-dnl Called: defframe_empty_if_zero(expression)
-define(defframe_empty_if_zero,
-`ifelse(defframe_empty_if_zero_disabled,1,
-`eval($1)',
-`m4_empty_if_zero($1)')')
-
-dnl Called: defframe_check_notbelow(`name',offset,FRAME)
-define(defframe_check_notbelow,
-m4_assert_numargs(3)
-`ifelse(eval(($3)+($2)<0),1,
-`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
-')')')
-
-
-dnl Usage: FRAME_pushl()
-dnl FRAME_popl()
-dnl FRAME_addl_esp(n)
-dnl FRAME_subl_esp(n)
-dnl
-dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
-dnl %esp of n bytes.
-dnl
-dnl Using these macros is completely optional. Sometimes it makes more
-dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
-dnl jumps and different sequences of FRAME values need to be used in
-dnl different places.
-
-define(FRAME_pushl,
-m4_assert_numargs(0)
-m4_assert_defined(`FRAME')
-`deflit(`FRAME',eval(FRAME+4))')
-
-define(FRAME_popl,
-m4_assert_numargs(0)
-m4_assert_defined(`FRAME')
-`deflit(`FRAME',eval(FRAME-4))')
-
-define(FRAME_addl_esp,
-m4_assert_numargs(1)
-m4_assert_defined(`FRAME')
-`deflit(`FRAME',eval(FRAME-($1)))')
-
-define(FRAME_subl_esp,
-m4_assert_numargs(1)
-m4_assert_defined(`FRAME')
-`deflit(`FRAME',eval(FRAME+($1)))')
-
-
-dnl Usage: defframe_pushl(name)
-dnl
-dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack
-dnl location just pushed. This should come after a pushl instruction.
-dnl Putting it on the same line works and avoids lengthening the code. For
-dnl example,
-dnl
-dnl pushl %eax defframe_pushl(VAR_COUNTER)
-dnl
-dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
-dnl current value without tracking future changes.
-
-define(defframe_pushl,
-`FRAME_pushl()defframe(`$1',-FRAME)')
-
-
-dnl --------------------------------------------------------------------------
-dnl Assembler instruction macros.
-dnl
-
-
-dnl Usage: emms_or_femms
-dnl femms_available_p
-dnl
-dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
-dnl femms instruction is available. emms_or_femms expands to femms if
-dnl available, or emms if not.
-dnl
-dnl emms_or_femms is meant for use in the K6 directory where plain K6
-dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
-dnl supported together.
-dnl
-dnl On K7 femms is no longer faster and is just an alias for emms, so plain
-dnl emms may as well be used.
-
-define(femms_available_p,
-m4_assert_numargs(-1)
-`m4_ifdef_anyof_p(
- `HAVE_TARGET_CPU_k62',
- `HAVE_TARGET_CPU_k63',
- `HAVE_TARGET_CPU_athlon')')
-
-define(emms_or_femms,
-m4_assert_numargs(-1)
-`ifelse(femms_available_p,1,`femms',`emms')')
-
-
-dnl Usage: femms
-dnl
-dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the
-dnl following is a replacement using .byte.
-dnl
-dnl If femms isn't available, an emms is generated instead, for convenience
-dnl when testing on a machine without femms.
-
-define(femms,
-m4_assert_numargs(-1)
-`ifelse(femms_available_p,1,
-`.byte 15,14 C AMD 3DNow femms',
-`emms`'dnl
-m4_warning(`warning, using emms in place of femms, use for testing only
-')')')
-
-
-dnl Usage: jadcl0(op)
-dnl
-dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact
-dnl replacement, since it doesn't set the flags like adcl does.
-dnl
-dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
-dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
-dnl misprediction penalty is small, and the multiply algorithm used leads
-dnl to a carry bit on average only 1/4 of the time.
-dnl
-dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for
-dnl comparison. For example,
-dnl
-dnl define(`jadcl0_disabled',1)
-dnl
-dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
-dnl the same size as an adcl. This makes it possible to use the exact same
-dnl computed jump code when testing the relative speed of jnc/incl and adcl
-dnl with jadcl0_disabled.
-
-define(jadcl0,
-m4_assert_numargs(1)
-`ifelse(jadcl0_disabled,1,
- `adcl $`'0, $1',
- `jnc 1f
- incl $1
-1:dnl')')
-
-
-dnl Usage: cmov_available_p
-dnl
-dnl Expand to 1 if cmov is available, 0 if not.
-
-define(cmov_available_p,
-`m4_ifdef_anyof_p(
- `HAVE_TARGET_CPU_pentiumpro',
- `HAVE_TARGET_CPU_pentium2',
- `HAVE_TARGET_CPU_pentium3',
- `HAVE_TARGET_CPU_athlon')')
-
-
-dnl Usage: x86_lookup(target, key,value, key,value, ...)
-dnl x86_lookup_p(target, key,value, key,value, ...)
-dnl
-dnl Look for `target' among the `key' parameters.
-dnl
-dnl x86_lookup expands to the corresponding `value', or generates an error
-dnl if `target' isn't found.
-dnl
-dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
-
-define(x86_lookup,
-`ifelse(eval($#<3),1,
-`m4_error(`unrecognised part of x86 instruction: $1
-')',
-`ifelse(`$1',`$2', `$3',
-`x86_lookup(`$1',shift(shift(shift($@))))')')')
-
-define(x86_lookup_p,
-`ifelse(eval($#<3),1, `0',
-`ifelse(`$1',`$2', `1',
-`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
-
-
-dnl Usage: x86_opcode_reg32(reg)
-dnl x86_opcode_reg32_p(reg)
-dnl
-dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
-dnl 32-bit register, eg. `%ebp' turns into 5.
-dnl
-dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
-dnl if not.
-
-define(x86_opcode_reg32,
-m4_assert_numargs(1)
-`x86_lookup(`$1',x86_opcode_reg32_list)')
-
-define(x86_opcode_reg32_p,
-m4_assert_onearg()
-`x86_lookup_p(`$1',x86_opcode_reg32_list)')
-
-define(x86_opcode_reg32_list,
-``%eax',0,
-`%ecx',1,
-`%edx',2,
-`%ebx',3,
-`%esp',4,
-`%ebp',5,
-`%esi',6,
-`%edi',7')
-
-
-dnl Usage: x86_opcode_tttn(cond)
-dnl
-dnl Expand to the 4-bit "tttn" field value for the given x86 branch
-dnl condition (like `c', `ae', etc).
-
-define(x86_opcode_tttn,
-m4_assert_numargs(1)
-`x86_lookup(`$1',x86_opcode_ttn_list)')
-
-define(x86_opcode_tttn_list,
-``o', 0,
-`no', 1,
-`b', 2, `c', 2, `nae',2,
-`nb', 3, `nc', 3, `ae', 3,
-`e', 4, `z', 4,
-`ne', 5, `nz', 5,
-`be', 6, `na', 6,
-`nbe', 7, `a', 7,
-`s', 8,
-`ns', 9,
-`p', 10, `pe', 10, `npo',10,
-`np', 11, `npe',11, `po', 11,
-`l', 12, `nge',12,
-`nl', 13, `ge', 13,
-`le', 14, `ng', 14,
-`nle',15, `g', 15')
-
-
-dnl Usage: cmovCC(srcreg,dstreg)
-dnl
-dnl Generate a cmov instruction if the target supports cmov, or simulate it
-dnl with a conditional jump if not (the latter being meant only for
-dnl testing). For example,
-dnl
-dnl cmovz( %eax, %ebx)
-dnl
-dnl cmov instructions are generated using .byte sequences, since only
-dnl recent versions of gas know cmov.
-dnl
-dnl The source operand can only be a plain register. (m4 code implementing
-dnl full memory addressing modes exists, believe it or not, but isn't
-dnl currently needed and isn't included.)
-dnl
-dnl All the standard conditions are defined. Attempting to use one without
-dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
-dnl an error. This ensures the necessary .byte sequences aren't
-dnl accidentally missed.
-
-dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
-define(define_cmov_many,
-`ifelse(m4_length(`$1'),0,,
-`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
-
-dnl Called: define_cmov(cond,tttn)
-define(define_cmov,
-m4_assert_numargs(2)
-`define(`cmov$1',
-m4_instruction_wrapper()
-m4_assert_numargs(2)
-`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
-m4_doublequote($`'1),m4_doublequote($`'2)))')
-
-define_cmov_many(x86_opcode_tttn_list)
-
-
-dnl Called: cmov_internal(name,cond,tttn,src,dst)
-define(cmov_internal,
-m4_assert_numargs(5)
-`ifelse(cmov_available_p,1,
-`cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
-`m4_warning(`warning, simulating cmov with jump, use for testing only
-')cmov_simulate(`$2',`$4',`$5')')')
-
-dnl Called: cmov_simulate(cond,src,dst)
-dnl If this is going to be used with memory operands for the source it will
-dnl need to be changed to do a fetch even if the condition is false, so as
-dnl to trigger exceptions the same way a real cmov does.
-define(cmov_simulate,
-m4_assert_numargs(3)
- `j$1 1f C cmov$1 $2, $3
- jmp 2f
-1: movl $2, $3
-2:')
-
-dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
-define(cmov_bytes_tttn,
-m4_assert_numargs(4)
-`.byte dnl
-15, dnl
-eval(64+$2), dnl
-eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
- C `$1 $3, $4'')
-
-
-dnl Usage: loop_or_decljnz label
-dnl
-dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
-dnl is better. "loop" is better on K6 and probably on 386, on other chips
-dnl separate decl/jnz is better.
-dnl
-dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
-dnl this loop_or_decljnz variation is enough to let the code be shared by
-dnl all chips.
-
-define(loop_or_decljnz,
-`ifelse(loop_is_better_p,1,
- `loop',
- `decl %ecx
- jnz')')
-
-define(loop_is_better_p,
-`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6',
- `HAVE_TARGET_CPU_k62',
- `HAVE_TARGET_CPU_k63',
- `HAVE_TARGET_CPU_i386')')
-
-
-dnl Usage: Zdisp(inst,op,op,op)
-dnl
-dnl Generate explicit .byte sequences if necessary to force a byte-sized
-dnl zero displacement on an instruction. For example,
-dnl
-dnl Zdisp( movl, 0,(%esi), %eax)
-dnl
-dnl expands to
-dnl
-dnl .byte 139,70,0 C movl 0(%esi), %eax
-dnl
-dnl If the displacement given isn't 0, then normal assembler code is
-dnl generated. For example,
-dnl
-dnl Zdisp( movl, 4,(%esi), %eax)
-dnl
-dnl expands to
-dnl
-dnl movl 4(%esi), %eax
-dnl
-dnl This means a single Zdisp() form can be used with an expression for the
-dnl displacement, and .byte will be used only if necessary. The
-dnl displacement argument is eval()ed.
-dnl
-dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
-dnl implemented with a table of instructions and encodings. A new entry is
-dnl needed for any different operation or registers.
-
-define(Zdisp,
-`define(`Zdisp_found',0)dnl
-Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl
-Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl
-Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl
-Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl
-Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl
-Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl
-Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
-Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl
-Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl
-Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl
-Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl
-Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl
-Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl
-Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
-Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
-Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
-Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl
-Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl
-Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
-Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
-Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
-Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
-Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
-Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
-Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
-Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
-ifelse(Zdisp_found,0,
-`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
-')')')
-
-define(Zdisp_match,
-`ifelse(eval(m4_stringequal_p(`$1',`$6')
- && m4_stringequal_p(`$2',0)
- && m4_stringequal_p(`$3',`$8')
- && m4_stringequal_p(`$4',`$9')),1,
-`define(`Zdisp_found',1)dnl
-ifelse(eval(`$7'),0,
-` .byte $5 C `$1 0$3, $4'',
-` $6 $7$8, $9')',
-
-`ifelse(eval(m4_stringequal_p(`$1',`$6')
- && m4_stringequal_p(`$2',`$7')
- && m4_stringequal_p(`$3',0)
- && m4_stringequal_p(`$4',`$9')),1,
-`define(`Zdisp_found',1)dnl
-ifelse(eval(`$8'),0,
-` .byte $5 C `$1 $2, 0$4'',
-` $6 $7, $8$9')')')')
-
-
-dnl Usage: shldl(count,src,dst)
-dnl shrdl(count,src,dst)
-dnl shldw(count,src,dst)
-dnl shrdw(count,src,dst)
-dnl
-dnl Generate a double-shift instruction, possibly omitting a %cl count
-dnl parameter if that's what the assembler requires, as indicated by
-dnl WANT_SHLDL_CL in config.m4. For example,
-dnl
-dnl shldl( %cl, %eax, %ebx)
-dnl
-dnl turns into either
-dnl
-dnl shldl %cl, %eax, %ebx
-dnl or
-dnl shldl %eax, %ebx
-dnl
-dnl Immediate counts are always passed through unchanged. For example,
-dnl
-dnl shrdl( $2, %esi, %edi)
-dnl becomes
-dnl shrdl $2, %esi, %edi
-dnl
-dnl
-dnl If you forget to use the macro form "shldl( ...)" and instead write
-dnl just a plain "shldl ...", an error results. This ensures the necessary
-dnl variant treatment of %cl isn't accidentally bypassed.
-
-define(define_shd_instruction,
-`define($1,
-m4_instruction_wrapper()
-m4_assert_numargs(3)
-`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
-m4_doublequote($`'2),m4_doublequote($`'3)))')
-
-dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
-define_shd_instruction(shldl)
-define_shd_instruction(shrdl)
-define_shd_instruction(shldw)
-define_shd_instruction(shrdw)
-
-dnl Called: shd_instruction(op,count,src,dst)
-define(shd_instruction,
-m4_assert_numargs(4)
-m4_assert_defined(`WANT_SHLDL_CL')
-`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
-``$1' `$3', `$4'',
-``$1' `$2', `$3', `$4'')')
-
-
-dnl Usage: ASSERT(cond, instructions)
-dnl
-dnl If WANT_ASSERT is 1, output the given instructions and expect the given
-dnl flags condition to then be satisfied. For example,
-dnl
-dnl ASSERT(ne, `cmpl %eax, %ebx')
-dnl
-dnl The instructions can be omitted to just assert a flags condition with
-dnl no extra calculation. For example,
-dnl
-dnl ASSERT(nc)
-dnl
-dnl When `instructions' is not empty, a pushf/popf is added to preserve the
-dnl flags, but the instructions themselves must preserve any registers that
-dnl matter. FRAME is adjusted for the push and pop, so the instructions
-dnl given can use defframe() stack variables.
-
-define(ASSERT,
-m4_assert_numargs_range(1,2)
-`ifelse(WANT_ASSERT,1,
- `C ASSERT
-ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
- $2
- j`$1' 1f
- ud2 C assertion failed
-1:
-ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
-')')
-
-
-dnl Usage: movl_text_address(label,register)
-dnl
-dnl Get the address of a text segment label, using either a plain movl or a
-dnl position-independent calculation, as necessary. For example,
-dnl
-dnl movl_code_address(L(foo),%eax)
-dnl
-dnl This macro is only meant for use in ASSERT()s or when testing, since
-dnl the PIC sequence it generates will want to be done with a ret balancing
-dnl the call on CPUs with return address branch predition.
-dnl
-dnl The addl generated here has a backward reference to 1b, and so won't
-dnl suffer from the two forwards references bug in old gas (described in
-dnl mpn/x86/README.family).
-
-define(movl_text_address,
-`ifdef(`PIC',
- `call 1f
-1: popl $2 C %eip
- addl `$'$1-1b, $2',
- `movl `$'$1, $2')')
-
-
-divert`'dnl