summaryrefslogtreecommitdiff
path: root/rts/gmp/mpn/x86/pentium
diff options
context:
space:
mode:
Diffstat (limited to 'rts/gmp/mpn/x86/pentium')
-rw-r--r--rts/gmp/mpn/x86/pentium/README77
-rw-r--r--rts/gmp/mpn/x86/pentium/aors_n.asm196
-rw-r--r--rts/gmp/mpn/x86/pentium/aorsmul_1.asm99
-rw-r--r--rts/gmp/mpn/x86/pentium/diveby3.asm183
-rw-r--r--rts/gmp/mpn/x86/pentium/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/lshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/lshift.asm455
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/rshift.asm460
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_1.asm79
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_basecase.asm135
-rw-r--r--rts/gmp/mpn/x86/pentium/rshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/sqr_basecase.asm520
14 files changed, 2900 insertions, 0 deletions
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000000..3b9ec8ac6f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/README
@@ -0,0 +1,77 @@
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has code for Pentium with MMX (P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_copyi/copyd 1.0
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 44.0
+ mpn_divexact_by3 15.0
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_mul_1 13.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+
+1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb.
+
+3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should. Intel documentation says a mul instruction is 10 cycles, but it
+measures 9 and the routines using it run with it as 9.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. Pentium doesn't allocate cache lines on writes, unlike most other modern
+processors. Since the functions in the mpn class do array writes, we have to
+handle allocating the destination cache lines by reading a word from it in the
+loops, to achieve the best performance.
+
+2. Pairing of memory operations requires that the two issued operations refer
+to different cache banks. The simplest way to insure this is to read/write
+two words from the same object. If we make operations on different objects,
+they might or might not be to the same cache bank.
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000000..a61082a456
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,196 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+dnl
+dnl P5: 2.375 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(endgo)
+
+ pushl %edx
+FRAME_pushl()
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,oop)
+
+L(endgo):
+deflit(`FRAME',16)
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,end)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(end)
+ pushl %edx
+FRAME_pushl()
+
+ ALIGN(8)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 8(%ebp),%ebx
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 12(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 16(%ebp),%ebx
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 20(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 24(%ebp),%ebx
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 28(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 32(%ebp),%ebx
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebp),%ebp
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+FRAME_popl()
+L(end):
+ decl %edx C test %edx w/o clobbering carry
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebp),%ebp
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000000..147b55610f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,99 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 14.0 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA. */
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ M4_inst %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm
new file mode 100644
index 0000000000..dbac81642f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/diveby3.asm
@@ -0,0 +1,183 @@
+dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P5: 15.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ movl PARAM_SIZE, %edx
+
+ decl %edx
+ jnz L(two_or_more)
+
+ movl (%ecx), %edx
+ movl PARAM_CARRY, %eax C risk of cache bank clash here
+
+ movl PARAM_DST, %ecx
+ subl %eax, %edx
+
+ sbbl %eax, %eax C 0 or -1
+
+ imull $INVERSE_3, %edx, %edx
+
+ negl %eax C 0 or 1
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b*2/3)
+ movl %edx, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ C eax
+ C ebx
+ C ecx src
+ C edx size-1
+ C esi
+ C edi
+ C ebp
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ movl PARAM_CARRY, %esi
+
+ movl (%ecx), %eax C src low limb
+ xorl %ebx, %ebx
+
+ sub %esi, %eax
+ movl $TWO_THIRDS_FLOOR, %esi
+
+ leal (%ecx,%edx,4), %ecx C &src[size-1]
+ leal (%edi,%edx,4), %edi C &dst[size-1]
+
+ adcl $0, %ebx C carry, 0 or 1
+ negl %edx C -(size-1)
+
+
+C The loop needs a source limb ready at the top, which leads to one limb
+C handled separately at the end, and the special case above for size==1.
+C There doesn't seem to be any scheduling that would keep the speed but move
+C the source load and carry subtract up to the top.
+C
+C The destination cache line prefetching adds 1 cycle to the loop but is
+C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
+C repeated write-throughs if the destination isn't in L1. A version using
+C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
+C no faster, due to unavoidable branch mispreditions in the inner loop.
+C
+C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
+C could be avoided then the src limb fetch could pair up and save a cycle.
+C This would probably mean going to a two limb loop with the carry limb
+C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
+C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
+C
+C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
+C "cmpl %edx, $n" with the immediate as the second operand.
+C
+C The "4" source displacement is in the loop rather than the setup because
+C this gets L(top) aligned to 8 bytes at no cost.
+
+ ALIGN(8)
+L(top):
+ C eax source limb, carry subtracted
+ C ebx carry (0 or 1)
+ C ecx &src[size-1]
+ C edx counter, limbs, negative
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp scratch (result limb)
+
+ imull $INVERSE_3, %eax, %ebp
+
+ cmpl $ONE_THIRD_CEIL, %ebp
+ movl (%edi,%edx,4), %eax C dst cache line prefetch
+
+ sbbl $-1, %ebx C +1 if ebp>=ceil(b/3)
+ cmpl %ebp, %esi
+
+ movl 4(%ecx,%edx,4), %eax C next src limb
+
+ sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3)
+ movl $0, %ebx
+
+ adcl $0, %ebx C new carry
+ movl %ebp, (%edi,%edx,4)
+
+ incl %edx
+ jnz L(top)
+
+
+
+ imull $INVERSE_3, %eax, %edx
+
+ cmpl $ONE_THIRD_CEIL, %edx
+ movl %edx, (%edi)
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3)
+ popl %ebp
+
+ movl %ebx, %eax
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000000..d3ed3d73ce
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 179
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 153
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 46
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 110
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 3840
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000000..e1e35d4c57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ addl %edx,%edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ adcl %ebx,%ebx
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebx,%ebx
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi C use leal not to clobber carry
+ leal 32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebx,(%edi)
+
+ leal 4(%esi),%esi C use leal not to clobber carry
+ leal 4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000000..2379077d0c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 99
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 40
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000000..2225438658
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,455 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 48(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 56(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 48(%ebx,%eax,4)
+ C mm3 dst qword, for 56(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
new file mode 100644
index 0000000000..587a07ab3d
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000000..7672630d57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,460 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+ C This loop is the important bit, the rest is just support. Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is clearly the same data as the
+ C movq for the aligned case, it's just a choice between whether one
+ C or two limbs should be written.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000000..08639eca09
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,79 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 13.0 cycles/limb
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA. */
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000000..d9f79a0831
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,135 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+dnl
+dnl P5: 14.2 cycles/crossproduct (approx)
+
+
+dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
+
diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000000..e8f5ae8ec8
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000000..c8584df13c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,520 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+dnl product at around 20x20 limbs.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ pushl %esi
+ pushl %ebx
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl %edx, %esi C dst[1]
+
+ movl 4(%ebx), %eax
+
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+ popl %ebx
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+ popl %esi
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+
+ cmpl $4, %edx
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_SRC, %ebx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%ebx), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
+ movl 8(%ebx), %eax
+ pushl %esi C risk of cache bank clash
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movl (%ebx), %eax
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%ebx), %eax
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %ebx
+ addl %esi, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %esi
+
+ adcl %ebp, %esi
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl %esi, 12(%ecx)
+
+ adcl %edx, %edi
+ popl %esi
+
+ movl 20(%ecx), %eax
+ movl %edi, 16(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ adcl %ebx, %eax C no carry out of this
+ popl %ebx
+
+ movl %eax, 20(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(four_or_more):
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+ C esi
+ C edi
+ C ebp
+ C
+ C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+ pushl %edi
+FRAME_pushl()
+ pushl %esi
+FRAME_pushl()
+
+ pushl %ebp
+FRAME_pushl()
+ leal (%ecx,%edx,4), %edi C dst end of this mul1
+
+ leal (%ebx,%edx,4), %esi C src end
+ movl %ebx, %ebp C src
+
+ negl %edx C -size
+ xorl %ebx, %ebx C clear carry limb and carry flag
+
+ leal 1(%edx), %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp src
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(mul1)
+
+
+ C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+ C n=1..size-2.
+ C
+ C The last two products, which are the end corner of the product
+ C triangle, are handled separately to save looping overhead. These
+ C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+ C If size is 4 then it's only these that need to be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ adcl $0, %ebx
+ movl PARAM_SIZE, %edx
+
+ movl %ebx, (%edi)
+ subl $4, %edx
+
+ negl %edx
+ jz L(corner)
+
+
+L(outer):
+ C ebx previous carry limb to store
+ C edx outer loop counter (negative)
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+
+ pushl %edx C new outer loop counter
+ leal -2(%edx), %ecx
+
+ movl %ebx, (%edi)
+ addl $4, %edi
+
+ addl $4, %ebp
+ xorl %ebx, %ebx C initial carry limb, clear carry flag
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp &src[j]
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(inner)
+
+
+ adcl $0, %ebx
+ popl %edx C outer loop counter
+
+ incl %edx
+ jnz L(outer)
+
+
+ movl %ebx, (%edi)
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-4]
+
+ movl -8(%esi), %eax
+ movl -4(%edi), %ebx C risk of data cache bank clash here
+
+ mull -12(%esi) C src[size-2]*src[size-3]
+
+ addl %eax, %ebx
+ movl %edx, %ecx
+
+ adcl $0, %ecx
+ movl -4(%esi), %eax
+
+ mull -12(%esi) C src[size-1]*src[size-3]
+
+ addl %ecx, %eax
+ movl (%edi), %ecx
+
+ adcl $0, %edx
+ movl %ebx, -4(%edi)
+
+ addl %eax, %ecx
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ movl -4(%esi), %eax
+
+ mull -8(%esi) C src[size-1]*src[size-2]
+
+ movl %ecx, 0(%edi)
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ movl PARAM_SIZE, %eax
+
+ negl %eax
+ movl %ebx, 4(%edi)
+
+ addl $1, %eax C -(size-1) and clear carry
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ movl 12(%edi,%eax,8), %ebx
+
+ rcll %ebx
+ movl 16(%edi,%eax,8), %ecx
+
+ rcll %ecx
+ movl %ebx, 12(%edi,%eax,8)
+
+ movl %ecx, 16(%edi,%eax,8)
+ incl %eax
+
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl PARAM_SRC, %esi
+
+ movl PARAM_SIZE, %ecx C risk of cache bank clash
+ movl %eax, 12(%edi) C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx
+
+ mull %eax
+
+ movl %eax, 16(%edi,%ecx,8) C dst[0]
+ movl %edx, %ebx
+
+ addl $1, %ecx C size-1 and clear carry
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, negative
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-4]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %ebx
+
+ mull %eax
+
+ movl 16-4(%edi,%ecx,8), %ebp
+
+ addl %ebp, %ebx
+ movl 16(%edi,%ecx,8), %ebp
+
+ adcl %eax, %ebp
+ movl %ebx, 16-4(%edi,%ecx,8)
+
+ movl %ebp, 16(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(diag)
+
+
+ adcl $0, %edx
+ movl 16-4(%edi), %eax C dst most significant limb
+
+ addl %eax, %edx
+ popl %ebp
+
+ movl %edx, 16-4(%edi)
+ popl %esi C risk of cache bank clash
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()