Diffstat (limited to 'rts/gmp/mpn/x86/k6')
-rw-r--r--  rts/gmp/mpn/x86/k6/README              |  237
-rw-r--r--  rts/gmp/mpn/x86/k6/aors_n.asm          |  329
-rw-r--r--  rts/gmp/mpn/x86/k6/aorsmul_1.asm       |  372
-rw-r--r--  rts/gmp/mpn/x86/k6/cross.pl            |  141
-rw-r--r--  rts/gmp/mpn/x86/k6/diveby3.asm         |  110
-rw-r--r--  rts/gmp/mpn/x86/k6/gmp-mparam.h        |   97
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/copyd.asm    |  179
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/copyi.asm    |  196
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/lshift.asm   |  286
-rw-r--r--  rts/gmp/mpn/x86/k6/k62mmx/rshift.asm   |  285
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/com_n.asm       |   91
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/logops_n.asm    |  212
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/lshift.asm      |  122
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/popham.asm      |  238
-rw-r--r--  rts/gmp/mpn/x86/k6/mmx/rshift.asm      |  122
-rw-r--r--  rts/gmp/mpn/x86/k6/mul_1.asm           |  272
-rw-r--r--  rts/gmp/mpn/x86/k6/mul_basecase.asm    |  600
-rw-r--r--  rts/gmp/mpn/x86/k6/sqr_basecase.asm    |  672
18 files changed, 4561 insertions, 0 deletions
diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000000..3ad96c8b89
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/README
@@ -0,0 +1,237 @@
+
+ AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx and k62mmx subdirectories have routines using MMX instructions. All
+K6s have MMX; the separate directories are just so that ./configure can omit
+them if the assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.25 normal, 2.75 in-place
+
+ mpn_mul_1 6.25
+ mpn_add/submul_1 7.65-8.4 (varying with data values)
+
+ mpn_mul_basecase 9.25 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
+ or 9.2 cycles/triangleproduct (approx)
+
+ mpn_divrem_1 20.0
+ mpn_mod_1 20.0
+ mpn_divexact_by3 11.0
+
+ mpn_l/rshift 3.0
+
+ mpn_copyi/copyd 1.0
+
+ mpn_com_n 1.5-1.85 \
+ mpn_and/andn/ior/xor_n 1.5-1.75 | varying with
+ mpn_iorn/xnor_n 2.0-2.25 | data alignment
+ mpn_nand/nior_n 2.0-2.25 /
+
+ mpn_popcount 12.5
+ mpn_hamdist 13.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+ mpn_l/rshift 1.75
+
+ mpn_copyi/copyd 0.56 or 1.0 \
+ |
+ mpn_com_n 1.0-1.2 | varying with
+ mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment
+ mpn_iorn/xnor_n 1.5-2.0 |
+ mpn_nand/nior_n 1.75-2.0 /
+
+ mpn_popcount 9.0
+ mpn_hamdist 11.5
+
+
+Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster. Results so far are inconsistent. The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling. An attractive feature of this is that times
+smoothly increase with operand size, but an indirect jump is about 6 cycles
+and the setups about another 6, so whether a computed jump ought to be used
+depends on how much faster the unrolled code is than a simple loop.
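+
+As an illustration, the effect resembles a Duff's device in C. This is a
+sketch only, with a hypothetical copy standing in for the real operations;
+dst, src and n are assumed given, with n >= 1:
+
+    long chunks = (n + 3) / 4;
+    switch (n % 4) {                 /* computed jump into the unrolling */
+    case 0: do {  *dst++ = *src++;
+    case 3:       *dst++ = *src++;
+    case 2:       *dst++ = *src++;
+    case 1:       *dst++ = *src++;
+            } while (--chunks > 0);
+    }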
+
+Position independent code is implemented using a call to get eip for the
+computed jumps. A ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory. Such a call however still costs 4 to 7
+cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations. This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential; they
+just show what slots might be available. When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+ short decode is inhibited. The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or have at least
+  2 instructions before a 32 byte boundary. This makes use of the 16 byte
+  cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
+ problem, and can be used as an equivalent, or easier is just to use a
+ different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
+ have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+ If more than 3 bytes are needed to determine instruction length then
+ decoding degrades from direct to long, or from long to vector. This
+ happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+ with mod=00 the sib determines whether there's a displacement.
+
+ This affects all MMX and 3DNow instructions, and others with an 0F prefix
+ like movzbl. The modes affected are anything with an index and no
+ displacement, or an index but no base, and this includes (%esp) which is
+ really (,%esp,1).
+
+ The cross.pl script detects problem cases. The workaround is to always
+ use a displacement, and to do this with Zdisp if it's zero so the
+ assembler doesn't discard it.
+
+ See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+ 13-14 and 36-37.
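+
+  For illustration, "movq (%esi,%ecx,4), %mm0" encodes as 0F 6F 04 8E,
+  and the fourth byte (the sib) must be examined to determine the
+  instruction length, degrading decode. The Zdisp form
+  "movq 0(%esi,%ecx,4), %mm0" encodes as 0F 6F 44 8E 00, where mod=01
+  in the mod/rm byte already implies a one byte displacement.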
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+ cycles.
+
+Various
+
+- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf 12-27 cycles
+- emms 5 cycles
+- femms 3 cycles
+- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl 20 cycles back-to-back
+- imull 2 decode, 2 execute
+- mull 2 decode, 3 execute (optimization manual decoding sample)
+- prefetch 2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+ immediate or %cl count: 11 + 2 per bit for dword
+ 13 + 4 per bit for byte
+- setCC 2 cycles
+- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
+ reg,reg 2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000. This describes K6-2 and
+K6-3. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997. This is an
+older edition of the above document, describing plain K6. Available
+on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general K6 optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000000..31b05ada51
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,329 @@
+dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+dnl
+dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
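+C
+C In C terms the add case is roughly the following sketch (an
+C illustration only, assuming 32-bit limbs and <stdint.h> types; the
+C subtract case propagates a borrow instead):
+C
+C     mp_limb_t c = carry;              /* 0 for plain mpn_add_n */
+C     for (i = 0; i < size; i++) {
+C         uint64_t s = (uint64_t) src1[i] + src2[i] + c;
+C         dst[i] = (uint32_t) s;
+C         c = (uint32_t) (s >> 32);     /* 0 or 1 */
+C     }
+C     return c;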
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE, `FRAME+16(%esp)')
+define(PARAM_SRC2, `FRAME+12(%esp)')
+define(PARAM_SRC1, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+ xorl %eax, %eax
+L(start):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC1, %ebx
+ pushl %edi
+FRAME_pushl()
+
+ movl PARAM_SRC2, %edx
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ jae L(unroll)
+
+
+ shrl %eax C initial carry flag
+
+ C offset 0x21 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+ C
+ C The store to (%edi) could be done with a stosl; it'd be smaller
+ C code, but there's no speed gain and a cld would have to be added
+ C (per mpn/x86/README.family).
+
+ movl (%ebx), %eax
+ leal 4(%ebx), %ebx
+
+ M4_inst (%edx), %eax
+
+ movl %eax, (%edi)
+ leal 4(%edi), %edi
+
+ leal 4(%edx), %edx
+ loop L(simple)
+
+
+ movl $0, %eax
+ popl %edi
+
+ setc %al
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+ C eax carry
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ cmpl %edi, %ebx
+ pushl %esi
+
+ je L(inplace)
+
+ifdef(`OPERATION_add_n',`
+ cmpl %edi, %edx
+
+ je L(inplace_reverse)
+')
+
+ movl %ecx, %esi
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+ shrl %eax
+
+ ALIGN(32)
+L(normal_top):
+ C eax counter, qwords, negative
+ C ebx src1
+ C ecx scratch
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+ M4_inst -20(%edx,%ecx,4), %eax
+ movl %eax, -20(%edi,%ecx,4)
+
+ movl 4-20(%ebx,%ecx,4), %eax
+ M4_inst 4-20(%edx,%ecx,4), %eax
+ movl %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%ebx,%ecx,4), %eax
+ M4_inst 8-20(%edx,%ecx,4), %eax
+ movl %eax, 8-20(%edi,%ecx,4)
+
+ movl 12-20(%ebx,%ecx,4), %eax
+ M4_inst 12-20(%edx,%ecx,4), %eax
+ movl %eax, 12-20(%edi,%ecx,4)
+
+ loop L(normal_top)
+
+
+ decl %esi
+ jz L(normal_finish_one)
+ js L(normal_done)
+
+ C two or three more limbs
+
+ movl (%ebx), %eax
+ M4_inst (%edx), %eax
+ movl %eax, (%edi)
+
+ movl 4(%ebx), %eax
+ M4_inst 4(%edx), %eax
+ decl %esi
+ movl %eax, 4(%edi)
+
+ jz L(normal_done)
+ movl $2, %ecx
+
+L(normal_finish_one):
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+
+L(normal_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+ C dst==src2
+
+ movl %ebx, %edx
+')
+
+L(inplace):
+ C eax initial carry
+ C ebx
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ leal -1(%ecx), %esi
+ decl %ecx
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ movl (%edx), %ebx C src low limb
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ shrl %eax
+
+
+ ALIGN(32)
+L(inplace_top):
+ C eax
+ C ebx next src limb
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ M4_inst %ebx, (%edi,%ecx,4)
+
+ movl 4(%edx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+
+ M4_inst %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%edx,%ecx,4), %eax
+ movl 12-20(%edx,%ecx,4), %ebx
+
+ M4_inst %eax, 8-20(%edi,%ecx,4)
+ M4_inst %ebx, 12-20(%edi,%ecx,4)
+
+ movl 16-20(%edx,%ecx,4), %ebx
+ loop L(inplace_top)
+
+
+ C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+ M4_inst %ebx, (%edi)
+
+ decl %esi
+ jz L(inplace_finish_one)
+ js L(inplace_done)
+
+ C two or three more limbs
+
+ movl 4(%edx), %eax
+ movl 8(%edx), %ebx
+ M4_inst %eax, 4(%edi)
+ M4_inst %ebx, 8(%edi)
+
+ decl %esi
+ movl $2, %ecx
+
+ jz L(normal_done)
+
+L(inplace_finish_one):
+ movl 4(%edx,%ecx,4), %eax
+ M4_inst %eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000000..da4120fe2f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,372 @@
+dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
+dnl PIC adds about 6 cycles at the start.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: large multipliers small multipliers
+dnl UNROLL_COUNT cycles/limb cycles/limb
+dnl 4 9.5 7.78
+dnl 8 9.0 7.78
+dnl 16 8.4 7.65
+dnl 32 8.4 8.2
+dnl
+dnl Maximum possible unrolling with the current code is 32.
+dnl
+dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C The jadcl0()s in the unrolled loop make the speed data dependent. Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data. Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help. It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C decode cycles
+C mull %ebp 2
+C M4_inst %esi, disp(%edi) 1
+C adcl %eax, %ecx 2
+C movl %edx, %esi \ 1
+C jnc 1f /
+C incl %esi \ 1
+C 1: movl disp(%ebx), %eax /
+C ---
+C 7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Watch this space for more info ...
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
+C --- * sum sum --- = --- * ------- * ------- = 1/4
+C b^2 x=0 y=1 b^2 b^4 2 2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
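+C
+C As a sanity check (an illustration only, not part of the routine), the
+C 1/4 can be verified exhaustively in C with a toy limb size such as b=16:
+C
+C     long b = 16, carries = 0;
+C     for (long x = 0; x < b; x++)
+C       for (long y = 0; y < b; y++)
+C         for (long u = 0; u < b; u++)
+C           for (long v = 0; v < b; v++)
+C             if (x*y + u*b + v >= b*b) carries++;
+C
+C carries/b^4 comes out as 14400/65536, somewhat under 1/4, and tends to
+C 1/4 as b grows.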
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
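+C
+C In C terms the overall operation is roughly the following sketch (an
+C illustration only, assuming 32-bit limbs and <stdint.h> types; the
+C real loop splits "c" across %ecx/%esi as described, and submul applies
+C the product with a borrow instead):
+C
+C     mp_limb_t c = initial;            /* 0 for plain mpn_addmul_1 */
+C     for (i = 0; i < size; i++) {
+C         uint64_t p = (uint64_t) src[i] * mult + c;
+C         uint64_t s = (uint64_t) dst[i] + (uint32_t) p;
+C         dst[i] = (uint32_t) s;
+C         c = (uint32_t) (p >> 32) + (uint32_t) (s >> 32);
+C     }
+C     return c;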
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+L(unroll):
+ C eax
+ C ebx src
+ C ecx size
+ C edx
+ C esi initial carry
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ shll $4, %edx
+ negl %ecx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%ebx), %eax C src low limb
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %esi, %eax C initial carry (from _1c)
+ jadcl0( %edx)
+
+
+ leal 4(%ebx,%ecx,4), %ebx
+ movl %edx, %esi C high carry
+
+ movl VAR_JUMP, %edx
+ leal (%edi,%ecx,4), %edi
+
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ jz L(noswap)
+ movl %esi, %ecx C high,low carry other way around
+
+ movl %eax, %esi
+L(noswap):
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx src
+ C ecx carry lo
+ C edx scratch
+ C esi carry hi
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes per limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+ deflit(`disp0', eval(2*i*4))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+ M4_inst %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(top)
+
+
+ popl %ebp
+ M4_inst %ecx, UNROLL_BYTES(%edi)
+
+ popl %edi
+ movl %esi, %eax
+
+ popl %ebx
+ jadcl0( %eax)
+
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl
new file mode 100644
index 0000000000..21734f3e52
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,141 @@
+#! /usr/bin/perl
+
+# Copyright (C) 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# As far as fixing problems goes, any cache line crossing problems in loops
+# get attention, but as a rule it's too tedious to rearrange code or slip in
+# nops to fix every problem in setup or finishup code.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that mode is easy to avoid when coding.
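+#
+# A typical run might look like the following (hypothetical output):
+#
+#     $ perl cross.pl aorsmul_1.o
+#     ... objdump disassembly ...
+#     ZZ (aorsmul_1.o) opcode/modrm cross 32-byte boundary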
+
+use strict;
+
+sub disassemble {
+ my ($file) = @_;
+ my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+
+ open (IN, "objdump -Srfh $file |")
+ || die "Cannot open pipe from objdump\n";
+ while (<IN>) {
+ print;
+
+ if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+ if ($1 < 5) {
+ print "ZZ need at least 2**5 for predictable cache line crossing\n";
+ }
+ }
+
+ if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+ } else {
+ next;
+ }
+
+ if ($b1 =~ /0f/) {
+ $prefix = $b1;
+ $opcode = $b2;
+ $modrm = $b3;
+ } else {
+ $prefix = '';
+ $opcode = $b1;
+ $modrm = $b2;
+ }
+
+ # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+ # for K6 and pre-CXT K6-2
+ if ($prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm =~ /^[0-3][4c]/) {
+ print "ZZ ($file) >3 bytes to determine instruction length\n";
+ }
+
+ # with just an opcode, starting 1f mod 20h
+ if ($addr =~ /[13579bdf]f$/
+ && $prefix !~ /0f/
+ && $opcode !~ /1[012345]/ # adc
+ && $opcode !~ /1[89abcd]/ # sbb
+ && $opcode !~ /68/ # push $imm32
+ && $opcode !~ /^7/ # jcond disp8
+ && $opcode !~ /a[89]/ # test+imm
+ && $opcode !~ /a[a-f]/ # stos/lods/scas
+ && $opcode !~ /b8/ # movl $imm32,%eax
+ && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
+ && $opcode !~ /e[b9]/ # jmp disp8/disp32
+ && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+ && !($opcode =~ /f[67]/ # grp 1
+ && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything starting at 1f mod 20h
+ if ($addr =~ /[13579bdf][f]$/
+ && $prefix =~ /0f/) {
+ print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+ if ($addr =~ /[13579bdf][e]$/
+ && $prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+ }
+ }
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ @files = glob "*.asm";
+  map {s/\.asm$/.o/} @files;
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm
new file mode 100644
index 0000000000..ffb97bc380
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/diveby3.asm
@@ -0,0 +1,110 @@
+dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K6: 11.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C
+C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
+C lead to vector decoding, unlike plain (%esi), which does.
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
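+
+C One limb step is, in C terms, roughly the following sketch (an
+C illustration only, assuming 32-bit limbs and <stdint.h> types; the
+C carry c stays in the range 0 to 3):
+C
+C     uint32_t s = src[i] - c;
+C     uint32_t b = (src[i] < c);               /* borrow */
+C     uint32_t q = s * INVERSE_3;              /* exact quotient limb */
+C     dst[i] = q;
+C     c = b + (uint32_t) (((uint64_t) q * 3) >> 32);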
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_SRC, %esi
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl PARAM_DST, %edi
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ leal (%esi,%ecx,4), %esi
+
+ pushl $3 defframe_pushl(VAR_THREE)
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+
+
+ C Need 32 alignment for claimed speed, to avoid the movl store
+ C opcode/modrm crossing a cache line boundary
+
+ ALIGN(32)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx counter, limbs, negative
+ C edx scratch, high product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
+ C doesn't cross a 32 byte boundary, saving a couple of cycles
+ C (that's a fixed couple, not per loop).
+
+Zdisp( movl, 0,(%esi,%ecx,4), %eax)
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull $INVERSE_3, %eax
+
+ movl %eax, (%edi,%ecx,4)
+ addl $2, %ecx
+
+ mull VAR_THREE
+
+ addl %edx, %ebx
+ loop L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl %ebx, %eax
+
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EDI, %edi
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000000..77f3948d77
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 3 /* cycles */
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME 20 /* cycles */
+#endif
+
+/* bsfl takes 12-27 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-04. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 130
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 34
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 68
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 67
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 472
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 4352
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 544
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
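+
+/* For illustration (a sketch, not the actual GMP dispatch code), a size
+   threshold such as KARATSUBA_MUL_THRESHOLD is consulted along the
+   lines of
+
+       if (size < KARATSUBA_MUL_THRESHOLD)
+           mpn_mul_basecase (...);
+       else
+           karatsuba_mul (...);      /* hypothetical name */
+
+   and tuneup.c measures where the crossover lies.  */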
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000000..20a33e6ccf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,179 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, processing limbs from high to low addresses.
+C
+C The comments in copyi.asm apply here too.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ std
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jnz L(already_aligned)
+
+ movsl
+
+ decl %ecx
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ C offset 0x3D here, but gets full speed without further alignment
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C `disp' is never 0, so don't need to force 0(%esi).
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ leal -UNROLL_BYTES(%esi), %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal -UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ subl $eval(UNROLL_BYTES/2), %esi
+ subl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
new file mode 100644
index 0000000000..215d805f2e
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
@@ -0,0 +1,196 @@
+dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
+C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
+C used instead.
+C
+C mod8
+C src dst
+C 0 0 both aligned, use mmx
+C 0 4 unaligned, use rep movs
+C 4 0 unaligned, use rep movs
+C 4 4 do one movs, then both aligned, use mmx
+C
+C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
+C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
+C
+C A pattern of two movq loads and two movq stores (or four and four) was
+C tried, but found to be the same speed as just one of each.
+C
+C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx
+C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
+C movs.
+C
+C Enhancement:
+C
+C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd
+C make it possible to avoid incrementing %esi and %edi in the loop and hence
+C get loop overhead down to 1 cycle. Care would be needed to avoid bad
+C cache line crossings since the "movq"s would then be 5 code bytes rather
+C than 4.
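+C
+C The alignment decision amounts to, in C terms (a sketch only, with
+C hypothetical helper names):
+C
+C     if (((uintptr_t) src + (uintptr_t) dst) & 4)   /* different mod 8 */
+C         rep_movs_copy (dst, src, size);
+C     else {
+C         if ((uintptr_t) src & 4) {                 /* both 4mod8 */
+C             *dst++ = *src++;                       /* one limb first */
+C             size--;
+C         }
+C         mmx_movq_copy (dst, src, size);
+C     }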
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyi)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ cld
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jz L(already_aligned)
+
+ decl %ecx
+
+ movsl
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ C this is offset 0x34, no alignment needed
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
+ C 0(%edi) keeps code aligned to 16 byte boundaries.
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+Zdisp( movq, disp,(%esi), %mm0)
+Zdisp( movq, %mm0, disp,(%edi))
+')
+
+ addl $UNROLL_BYTES, %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+ addl $eval(UNROLL_BYTES/2), %esi
+ addl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000000..f6d54f97a8
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,286 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
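+C In C terms (a sketch only, assuming 32-bit limbs and 1 <= shift <= 31):
+C
+C     mp_limb_t ret = src[size-1] >> (32 - shift);
+C     for (i = size-1; i > 0; i--)
+C         dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C     dst[0] = src[0] << shift;
+C     return ret;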
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ movl %edx, VAR_RETVAL
+ jae L(unroll)
+
+
+ movd %ecx, %mm7
+ movl %eax, %ecx
+
+ movl PARAM_DST, %eax
+
+L(simple):
+ C eax dst
+ C ebx src
+ C ecx counter, size-1 to 1
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%ecx,4), %mm0
+
+ psrlq %mm7, %mm0
+
+Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
+ loop L(simple)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+
+ movd %mm0, (%eax)
+ movl %edx, %eax
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval (but instead VAR_RETVAL is used)
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ movl PARAM_DST, %edx
+
+ movd %ecx, %mm7
+ subl $7, %eax C size-8
+
+ leal (%edx,%eax,4), %ecx C alignment of dst
+
+ movq 32-8(%ebx,%eax,4), %mm2 C src high qword
+ testb $4, %cl
+
+ jz L(dst_aligned)
+ psllq %mm6, %mm2
+
+ psrlq $32, %mm2
+ decl %eax
+
+ movd %mm2, 32(%edx,%eax,4) C dst high limb
+ movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
+L(dst_aligned):
+
+ movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of size-8 lets the loop stop when %eax goes negative and
+ C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, size-8 step by -4 until <0
+ C ebx src
+ C ecx
+ C edx dst
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psllq %mm6, %mm2
+ subl $4, %eax
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 24(%ebx,%eax,4), %mm0
+
+ psllq %mm6, %mm1
+ movq %mm2, 40(%edx,%eax,4)
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm1, 32(%edx,%eax,4)
+ jnc L(top)
+
+
+ C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+ C
+ C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+ C %eax is between -4 and -1, representing respectively 0 to 3 extra
+ C limbs that must be read.
+
+
+ testl $2, %eax C testl to avoid bad cache line crossing
+ jz L(finish_nottwo)
+
+ C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+ C new mm2 and a new mm0 is loaded.
+
+ psllq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psrlq %mm7, %mm0
+ subl $2, %eax
+
+ por %mm0, %mm2
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm2, 32(%edx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+ testb $1, %al
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ psllq %mm6, %mm1
+
+ movq %mm2, 24(%edx,%eax,4)
+ jz L(finish_even)
+
+
+ C Size is odd, so mm1 and one extra limb to process.
+
+ movd (%ebx), %mm0 C src[0]
+ popl %ebx
+deflit(`FRAME',0)
+
+ movq %mm0, %mm2
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ psllq %mm6, %mm2
+ por %mm0, %mm1
+
+ movq %mm1, 4(%edx) C dst[1,2]
+ movd %mm2, (%edx) C dst[0]
+
+ movl VAR_RETVAL, %eax
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+ C Size is even, so only mm1 left to process.
+
+ movq %mm1, (%edx) C dst[0,1]
+ movl VAR_RETVAL, %eax
+
+ popl %ebx
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000000..8a8c144241
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,285 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
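+C In C terms (a sketch only, assuming 32-bit limbs and 1 <= shift <= 31):
+C
+C     mp_limb_t ret = src[0] << (32 - shift);
+C     for (i = 0; i < size-1; i++)
+C         dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C     dst[size-1] = src[size-1] >> shift;
+C     return ret;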
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+
+
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ movl PARAM_DST, %ecx
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+ C This loop runs at about 3 cycles/limb, which is the amount of
+ C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+ C eax counter, -(size-1) to -1
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ subl $7, %eax C size-8
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+ movq (%ebx), %mm2 C src low qword
+ leal (%ebx,%eax,4), %ebx C src end - 32
+
+ testb $4, %cl
+ leal (%ecx,%eax,4), %ecx C dst end - 32
+
+ notl %eax C -(size-7)
+ jz L(dst_aligned)
+
+ psrlq %mm6, %mm2
+ incl %eax
+
+Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
+ movq 4(%ebx,%eax,4), %mm2 C new src low qword
+L(dst_aligned):
+
+ movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
+ nop C avoid bad cache line crossing
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+	C The use of -(size-7) lets the loop stop when %eax becomes >= 0
+	C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, -(size-7) step by +4 until >=0
+ C ebx src end - 32
+ C ecx dst end - 32
+ C edx retval
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psrlq %mm6, %mm2
+ addl $4, %eax
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 4(%ebx,%eax,4), %mm0
+
+ psrlq %mm6, %mm1
+ movq %mm2, -12(%ecx,%eax,4)
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm1, -4(%ecx,%eax,4)
+ ja L(top) C jump if no carry and not zero
+
+
+
+ C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+ C to 3 representing respectively 3 to 0 further limbs.
+
+ testl $2, %eax C testl to avoid bad cache line crossings
+ jnz L(finish_nottwo)
+
+ C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+ C becomes new mm2 and a new mm0 is loaded.
+
+ psrlq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ addl $2, %eax
+
+ por %mm0, %mm2
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm2, -4(%ecx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ testb $1, %al
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ psrlq %mm6, %mm1
+
+ movq %mm2, 4(%ecx,%eax,4)
+ jnz L(finish_even)
+
+
+ C one further extra limb to process
+
+ movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
+ popl %ebx
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ psrlq %mm6, %mm2
+
+ movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
+ movd %mm2, 32-4(%ecx) C dst[size-1]
+
+ movl %edx, %eax C retval
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+ C no further extra limbs
+
+ movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
+ movl %edx, %eax C retval
+
+ popl %ebx
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
new file mode 100644
index 0000000000..8915080f0f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
@@ -0,0 +1,91 @@
+dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb
+dnl K6 1.5 1.85 1.75 1.85
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise one's complement of src,size and write it to dst,size.
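+C
+C For reference, a minimal C equivalent (a sketch only, not part of the
+C build); the loop below does the same two limbs at a time, using pxor
+C against an all-ones qword as the ~ operation:
+C
+C	void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < size; i++)
+C	    dst[i] = ~ src[i];
+C	}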
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_com_n)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ shrl %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ notl %eax
+ movl %eax, (%edx)
+ ret
+
+
+L(two_or_more):
+ pushl %ebx
+FRAME_pushl()
+ movl %ecx, %ebx
+
+ pcmpeqd %mm7, %mm7 C all ones
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx floor(size/2)
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+ movq -8(%eax,%ecx,8), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, -8(%edx,%ecx,8)
+ loop L(top)
+
+
+ jnc L(no_extra)
+ movl (%eax,%ebx,8), %eax
+ notl %eax
+ movl %eax, (%edx,%ebx,8)
+L(no_extra):
+
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000000..46cb3b7ea5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,212 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+dnl
+dnl alignment dst/src1/src2, A=0mod8, N=4mod8
+dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+dnl
+dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+dnl
+dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl M4_p and M4_i are the MMX and integer instructions
+dnl M4_*_neg_dst means whether to negate the final result before writing
+dnl M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p', `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i', `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl style (the two are equivalent for xor).
+
+M4_choose_op( and_n, pand,0,0, andl,0,0)
+M4_choose_op( andn_n, pandn,0,0, andl,0,1)
+M4_choose_op( nand_n, pand,1,0, andl,1,0)
+M4_choose_op( ior_n, por,0,0, orl,0,0)
+M4_choose_op( iorn_n, por,0,1, orl,0,1)
+M4_choose_op( nior_n, por,1,0, orl,1,0)
+M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C	   alignment dst/src1, A=0mod8, N=4mod8
+C	       A/A    A/N    N/A    N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
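+C
+C For reference, each of the generated functions is equivalent to a
+C simple C loop over the limbs, sketched here for mpn_nand_n (per the
+C M4_choose_op table above, the eight variants differ only in the
+C operator used and in whether src2 or the result is inverted):
+C
+C	void mpn_nand_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                 mp_size_t size)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < size; i++)
+C	    dst[i] = ~ (src1[i] & src2[i]);
+C	}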
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function)
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+ FRAME_pushl()
+ movl PARAM_SRC1, %eax
+ movl PARAM_SRC2, %ebx
+ cmpl $1, %ecx
+ movl PARAM_DST, %edx
+ ja L(two_or_more)
+
+
+ movl (%ebx), %ecx
+ popl %ebx
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ movl %ecx, (%edx)
+
+ ret
+
+
+L(two_or_more):
+ C eax src1
+ C ebx src2
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ pushl %esi
+ FRAME_pushl()
+ testl $4, %eax
+ jz L(alignment_ok)
+
+ movl (%ebx), %esi
+ addl $4, %ebx
+ifelse(M4_i_neg_src2,1,`notl %esi')
+ M4_i (%eax), %esi
+ addl $4, %eax
+ifelse(M4_i_neg_dst,1,` notl %esi')
+ movl %esi, (%edx)
+ addl $4, %edx
+ decl %ecx
+
+L(alignment_ok):
+ movl %ecx, %esi
+ shrl %ecx
+ jnz L(still_two_or_more)
+
+ movl (%ebx), %ecx
+ popl %esi
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ popl %ebx
+ movl %ecx, (%edx)
+ ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+ pcmpeqd %mm7, %mm7 C all ones
+')
+
+ ALIGN(16)
+L(top):
+ C eax src1
+ C ebx src2
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ movq -8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
+ M4_p -8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
+ movq %mm0, -8(%edx,%ecx,8)
+
+ loop L(top)
+
+
+ jnc L(no_extra)
+
+ movl -4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl %ebx')
+ M4_i -4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,` notl %ebx')
+ movl %ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+ popl %esi
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000000..f1dc83db46
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
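+C
+C For reference, a C sketch of the operation, assuming 32-bit limbs and
+C a shift count in the range 1 to 31 (not part of the build):
+C
+C	mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	                      unsigned shift)
+C	{
+C	  mp_limb_t  retval = src[size-1] >> (32 - shift);
+C	  mp_size_t  i;
+C	  for (i = size-1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C	  dst[0] = src[0] << shift;
+C	  return retval;
+C	}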
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%ecx,%eax,4)
+ jnz L(top)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax
+
+ movd %mm0, (%ecx)
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000000..2c619252bb
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,238 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+dnl
+dnl popcount hamdist
+dnl K6-2: 9.0 11.5 cycles/limb
+dnl K6: 12.5 13.0
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
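+C
+C The counting is the usual parallel bit-summing trick; on a single
+C 32-bit limb it would look as follows in C (the masks are the same
+C constants loaded into mm7 to mm4 below, and for hamdist the starting
+C value is src[i]^src2[i] rather than src[i]):
+C
+C	x -= (x & 0xAAAAAAAA) >> 1;                       /* bit pairs */
+C	x = (x & 0x33333333) + ((x >> 2) & 0x33333333);   /* nibbles   */
+C	x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);   /* bytes     */
+C	x += x >> 8;                                      /* words     */
+C	x += x >> 16;                                     /* dwords    */
+C	total += x & 0xFF;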
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+LS(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ .text
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq LS(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax
+ emms_or_femms
+ ret
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000000..cc5948f26c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
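+C
+C For reference, a C sketch of the operation, assuming 32-bit limbs and
+C a shift count in the range 1 to 31 (not part of the build):
+C
+C	mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	                      unsigned shift)
+C	{
+C	  mp_limb_t  retval = src[0] << (32 - shift);
+C	  mp_size_t  i;
+C	  for (i = 0; i < size-1; i++)
+C	    dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C	  dst[size-1] = src[size-1] >> shift;
+C	  return retval;
+C	}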
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000000..c2220fe4ca
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,272 @@
+dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K6: 6.25 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the result.
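+C
+C A C sketch of the operation, assuming 32-bit limbs and a compiler with
+C a 64-bit "unsigned long long" (not part of the build; mpn_mul_1c is
+C the same but with carry starting from the given parameter instead of
+C zero):
+C
+C	mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	                     mp_limb_t multiplier)
+C	{
+C	  mp_limb_t  carry = 0;
+C	  mp_size_t  i;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      unsigned long long  p;
+C	      p = (unsigned long long) src[i] * multiplier + carry;
+C	      dst[i] = (mp_limb_t) p;
+C	      carry = (mp_limb_t) (p >> 32);
+C	    }
+C	  return carry;
+C	}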
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ mov PARAM_SIZE, %ecx
+ push %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ push %edi
+FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+FRAME_pushl()
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ jae L(unroll)
+
+
+ C code offset 0x22 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+ C
+ C this loop 8 cycles/limb
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(simple)
+
+
+ popl %ebp
+
+ popl %edi
+ popl %ebx
+
+ movl %esi, %eax
+ popl %esi
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end. Rotating the load/store to the top
+C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+ ALIGN(16) C already aligned to 16 here actually
+L(unroll):
+ movl (%ebx), %eax
+ leal -16(%ebx,%ecx,4), %ebx
+
+ leal -16(%edi,%ecx,4), %edi
+ subl $4, %ecx
+
+ negl %ecx
+
+
+ ALIGN(16) C one byte nop for this alignment
+L(top):
+ C eax scratch
+ C ebx &src[size-4]
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi &dst[size-4]
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl 12(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 12(%edi,%ecx,4)
+ movl 16(%ebx,%ecx,4), %eax
+
+
+ addl $4, %ecx
+ js L(top)
+
+
+
+ C eax next src limb
+ C ebx &src[size-4]
+ C ecx 0 to 3 representing respectively 4 to 1 further limbs
+ C edx
+ C esi carry
+ C edi &dst[size-4]
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+ addl $2, %ecx
+L(finish_not_two):
+
+
+ testb $1, %cl
+ jnz L(finish_not_one)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi)
+ movl 12(%ebx), %eax
+L(finish_not_one):
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ popl %ebp
+
+ adcl $0, %edx
+
+ movl %eax, 12(%edi)
+ popl %edi
+
+ popl %ebx
+ movl %edx, %eax
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000000..1f5a3a4b4b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,600 @@
+dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+dnl unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: UNROLL_COUNT cycles/product (approx)
+dnl 8 9.75
+dnl 16 9.3
+dnl 32 9.3
+dnl Maximum possible with the current code is 32.
+dnl
+dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl  might explain its good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once. The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Future:
+C
+C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
+C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
+C 256 bytes.
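+C
+C In outline the computation is the following sketch, with mp_size_t i
+C (mpn_addmul_1 multiplies src,size by a limb and adds the product into
+C dst,size, returning the carry limb out the top):
+C
+C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C	for (i = 1; i < ysize; i++)
+C	  wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);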
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two_limbs)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ movl (%edx), %edx C xp low limb
+ movl PARAM_WP, %ecx
+
+ mull %edx
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+ decl PARAM_YSIZE
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_WP, %ebx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl %eax, %ecx C yp low limb
+ movl (%edx), %eax C xp low limb
+
+ movl %edx, %esi C xp
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+ movl %esi, 4(%ebx)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi
+deflit(`FRAME',12)
+ movl %eax, (%ebx)
+
+ movl 4(%esi), %eax
+ movl %edx, %edi C carry, for wp[1]
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+
+ movl %edi, 4(%ebx)
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+
+ movl (%esi), %eax C xp[0]
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ adcl $0, %esi
+
+ movl %edi, 8(%ebx)
+ popl %edi
+
+ movl %esi, 12(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',0)
+
+ pushl %edi defframe_pushl(SAVE_EDI)
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_WP, %edi
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl %eax, %ebp
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ leal (%edx,%ecx,4), %ebx C xp end
+ xorl %esi, %esi
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi) C final carry
+
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jnz L(ysize_more_than_one_limb)
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+L(ysize_more_than_one_limb):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop. The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+ C eax yp
+ C ebx xp end
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1
+ C ebp
+
+ movl 4(%eax), %ebp C multiplier
+ negl %ecx
+
+ movl %ecx, PARAM_XSIZE C -xsize
+ xorl %esi, %esi C initial carry
+
+ leal 4(%eax,%edx,4), %eax C yp end
+ negl %edx
+
+ movl %eax, PARAM_YP
+ movl %edx, PARAM_YSIZE
+
+ jmp L(simple_outer_entry)
+
+
+ C aligning here saves a couple of cycles
+ ALIGN(16)
+L(simple_outer_top):
+ C edx ysize counter, negative
+
+ movl PARAM_YP, %eax C yp end
+ xorl %esi, %esi C carry
+
+ movl PARAM_XSIZE, %ecx C -xsize
+ movl %edx, PARAM_YSIZE
+
+ movl (%eax,%edx,4), %ebp C yp limb multiplier
+L(simple_outer_entry):
+ addl $4, %edi
+
+
+L(simple_inner):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of this addmul
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl %edx, %esi
+
+ incl %ecx
+ jnz L(simple_inner)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi)
+
+ incl %edx
+ jnz L(simple_outer_top)
+
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop. This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
+C the stack remain constant and on each loop a leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl slower in this code, hence the defframe empty-if-zero feature is
+dnl disabled.
+dnl
+dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
+dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+ C eax yp (not used)
+ C ebx xp end (not used)
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1 (not used)
+ C ebp
+deflit(`FRAME', 16)
+
+ leal -2(%ecx), %ebp C one limb processed at start,
+ decl %ecx C and ebp is one less
+
+ shrl $UNROLL_LOG2, %ebp
+ negl %ecx
+
+ subl $VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+ andl $UNROLL_MASK, %ecx
+
+ movl %ecx, %esi
+ shll $4, %ecx
+
+ movl %ebp, VAR_COUNTER_INIT
+ negl %esi
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+ movl PARAM_XP, %ebx
+ movl %ebp, VAR_COUNTER
+
+ movl PARAM_WP, %edi
+ movl %ecx, VAR_JMP
+
+ movl (%ebx), %eax
+ leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
+
+ leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
+
+ movl %eax, VAR_XP_LOW
+
+ movl %ebx, PARAM_XP
+ movl PARAM_YP, %ebx
+
+ leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
+ movl 4(%ebx), %ebp C multiplier (yp second limb)
+
+ leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
+
+ movl %ecx, PARAM_WP
+
+ leal 1(%esi), %ecx C adjust parity for decl %ecx above
+
+ movl %ebx, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%esi,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ C Aligning here saves a couple of cycles per loop. Using 32 doesn't
+ C cost any extra space, since the inner unrolled loop below is
+ C aligned to 32.
+ ALIGN(32)
+L(unroll_outer_top):
+ C edx ysize
+
+ movl PARAM_YP, %eax
+ movl %edx, PARAM_YSIZE C incremented ysize counter
+
+ movl PARAM_WP, %edi
+
+ movl VAR_COUNTER_INIT, %ebx
+ movl (%eax,%edx,4), %ebp C next multiplier
+
+ movl PARAM_XSIZE, %ecx
+ leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
+
+ movl VAR_XP_LOW, %eax
+ movl %ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ C using testb is a tiny bit faster than testl
+ testb $1, %cl
+
+ movl %eax, %ecx C low carry
+ movl VAR_JMP, %eax
+
+ movl %edx, %esi C high carry
+ movl PARAM_XP, %ebx
+
+ jnz L(unroll_noswap)
+ movl %ecx, %esi C high,low carry other way around
+
+ movl %edx, %ecx
+L(unroll_noswap):
+
+ jmp *%eax
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax scratch
+ C ebx xp
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi wp
+ C ebp multiplier
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes each limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4))
+ deflit(`disp1', eval(disp0 + 4))
+ deflit(`disp2', eval(disp1 + 4))
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp2(%ebx), %eax
+ mull %ebp
+ addl %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(unroll_top)
+
+
+ movl PARAM_YSIZE, %edx
+ addl %ecx, UNROLL_BYTES(%edi)
+
+ adcl $0, %esi
+
+ incl %edx
+ movl %esi, UNROLL_BYTES+4(%edi)
+
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ addl $FRAME, %esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000000..70d49b3e57
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,672 @@
+dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+dnl product (measured on the speed difference between 17 and 33 limbs,
+dnl which is roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
+dnl code supports. This value is used only by the tune program to know
+dnl what it can go up to. (An attempt to compile with a bigger value will
+dnl trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl The value is determined by requiring the displacements in the unrolled
+dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
+dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+
+dnl Allow a value from the tune program to override config.m4.
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+
+dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
+dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
+dnl
+dnl The first addmul is the biggest, and this takes the second least
+dnl significant limb and multiplies it by the third least significant and
+dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which if understood could give a simple speedup.
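+C
+C In outline the computation is the following sketch, assuming size >= 2
+C (the 1 to 3 limb cases get special code below), 32-bit limbs and a
+C 64-bit "unsigned long long"; i is mp_size_t and cy is mp_limb_t:
+C
+C	/* off-diagonal products src[i]*src[j], j>i, at dst[1..2*size-2] */
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (i = 1; i < size-1; i++)
+C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
+C
+C	/* double them, the bit out the top becoming dst[2*size-1] */
+C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C
+C	/* add the squares src[i]^2 on the diagonal (dst[0] is unset
+C	   so far, hence the explicit zero) */
+C	cy = 0;  dst[0] = 0;
+C	for (i = 0; i < size; i++)
+C	  {
+C	    unsigned long long  sq = (unsigned long long) src[i] * src[i];
+C	    unsigned long long  lo = (unsigned long long) dst[2*i] + cy
+C	                              + (sq & 0xFFFFFFFF);
+C	    unsigned long long  hi = (unsigned long long) dst[2*i+1]
+C	                              + (lo >> 32) + (sq >> 32);
+C	    dst[2*i]   = (mp_limb_t) lo;
+C	    dst[2*i+1] = (mp_limb_t) hi;
+C	    cy = (mp_limb_t) (hi >> 32);
+C	  }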
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %ecx
+ je L(two_limbs)
+
+ movl PARAM_DST, %edx
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+deflit(`FRAME',4)
+
+ movl (%ebx), %eax
+ movl PARAM_DST, %ecx
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0]*src[1]
+
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ popl %ebx
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+ cmpl $4, %ecx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+
+ movl (%ebx), %eax
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+ pushl %esi
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ pushl %edi
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 20(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+
+ movl %edx, %edi
+ movl 8(%ebx), %edx
+
+ pushl %ebp
+ xorl %ebp, %ebp
+
+ mull %edx C src[0] * src[2]
+
+ addl %eax, %edi
+ movl 4(%ebx), %eax
+
+ adcl %edx, %ebp
+
+ movl 8(%ebx), %edx
+
+ mull %edx C src[1] * src[2]
+
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+
+ C eax will be dst[5]
+ C ebx
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ xorl %eax, %eax
+ addl %esi, %esi
+ adcl %edi, %edi
+ adcl %ebp, %ebp
+ adcl %edx, %edx
+ adcl $0, %eax
+
+ addl %esi, 4(%ecx)
+ adcl %edi, 8(%ecx)
+ adcl %ebp, 12(%ecx)
+
+ popl %ebp
+ popl %edi
+
+ adcl %edx, 16(%ecx)
+
+ popl %esi
+ popl %ebx
+
+ adcl %eax, 20(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(STACK_SPACE, 24)
+
+ ALIGN(16)
+L(four_or_more):
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ leal 4(%edx), %edi
+
+ movl %ebx, SAVE_EBX
+ leal 4(%eax), %ebx
+
+ movl %esi, SAVE_ESI
+ xorl %esi, %esi
+
+ movl %ebp, SAVE_EBP
+
+ C eax
+ C ebx src+4
+ C ecx size
+ C edx
+ C esi
+ C edi dst+4
+ C ebp
+
+ movl (%eax), %ebp C multiplier
+ leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
+
+
+ ALIGN(16)
+L(mul_1):
+ C eax scratch
+ C ebx src ptr
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst ptr
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time. The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
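+C
+C Entering an unrolled loop part way through like this is the assembler
+C analogue of Duff's device in C, sketched below for a 4-way unrolled
+C copy assuming count >= 1, except that here the entry point is computed
+C as an address and reached with an indirect jmp rather than a switch:
+C
+C	n = (count + 3) / 4;
+C	switch (count % 4)
+C	  {
+C	  case 0: do { *to++ = *from++;
+C	  case 3:      *to++ = *from++;
+C	  case 2:      *to++ = *from++;
+C	  case 1:      *to++ = *from++;
+C	          } while (--n > 0);
+C	  }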
+
+
+dnl This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31. Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+ C eax
+ C ebx &src[size]
+ C ecx
+ C edx
+ C esi carry
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %ebx')
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %edi')
+
+ negl %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limitation UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+ C
+ ASSERT(ae,`
+ movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx
+ ')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx &src[size], constant
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi high limb to store
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+ testb $1, %cl
+
+ movl %edx, %esi C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ movl %edx, VAR_JMP
+ leal 4(%edi), %edi
+
+ C A branch-free version of this using some xors was found to be a
+ C touch slower than just a conditional jump, despite the jump
+ C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+ jz,jnz) L(unroll_noswap)
+ movl %esi, %eax C high,low carry other way around
+
+ movl %ecx, %esi
+ movl %eax, %ecx
+L(unroll_noswap):
+
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/esi should start.
+ C
+ C An attempt was made at padding here to get the end of the unrolled
+ C code to come out on a good alignment, to save padding before
+ C L(corner). This worked, but turned out to run slower than just an
+ C ALIGN(2). The reason for this is not clear, it might be related
+ C to the different speeds on different UNROLL_COUNTs noted above.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx src
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %esi, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+ addl %esi, -4+OFFSET(%edi)
+
+ movl VAR_COUNTER, %edx
+ jadcl0( %ecx)
+
+ movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %ebx
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C ebx &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%ebx), %ebp
+
+ movl -8(%ebx), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ adcl $0, %edx
+
+ movl -4(%ebx), %eax
+ movl %edx, %esi
+ movl %eax, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ adcl $0, %edx
+
+ addl %eax, (%edi)
+ adcl $0, %edx
+
+ movl %edx, %esi
+ movl %ebx, %eax
+
+ mull %ecx
+
+ addl %esi, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_DST, %edi
+ subl $1, %ecx C size-1 and clear carry
+
+ movl PARAM_SRC, %ebx
+ movl %ecx, %edx
+
+ xorl %eax, %eax C ready for adcl
+
+
+ ALIGN(16)
+L(lshift):
+ C eax
+ C ebx src (for later use)
+ C ecx counter, decrementing
+ C edx size-1 (for later use)
+ C esi
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+ leal 8(%edi), %edi
+ loop L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%ebx), %eax C src[0]
+
+ leal 4(%ebx,%edx,4), %ebx C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the low limb
+C of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+ ALIGN(16)
+L(diag):
+ C eax scratch
+ C ebx &src[size]
+ C ecx counter, negative
+ C edx carry
+ C esi scratch
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ movl %edx, %esi
+
+ mull %eax
+
+ addl %esi, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()