diff options
Diffstat (limited to 'rts/gmp/mpn/x86/pentium')
-rw-r--r-- | rts/gmp/mpn/x86/pentium/README | 77 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/aors_n.asm | 196 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/aorsmul_1.asm | 99 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/diveby3.asm | 183 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/gmp-mparam.h | 97 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/lshift.asm | 236 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h | 97 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mmx/lshift.asm | 455 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mmx/popham.asm | 30 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mmx/rshift.asm | 460 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mul_1.asm | 79 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/mul_basecase.asm | 135 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/rshift.asm | 236 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/pentium/sqr_basecase.asm | 520 |
14 files changed, 2900 insertions, 0 deletions
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README new file mode 100644 index 0000000000..3b9ec8ac6f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/README @@ -0,0 +1,77 @@ + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has code for Pentium with MMX (P55). + + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_copyi/copyd 1.0 + + mpn_divrem_1 44.0 + mpn_mod_1 44.0 + mpn_divexact_by3 15.0 + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_mul_1 13.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + +1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. + +3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run with it as 9. + + + +RELEVANT OPTIMIZATION ISSUES + +1. Pentium doesn't allocate cache lines on writes, unlike most other modern +processors. Since the functions in the mpn class do array writes, we have to +handle allocating the destination cache lines by reading a word from it in the +loops, to achieve the best performance. + +2. Pairing of memory operations requires that the two issued operations refer +to different cache banks. The simplest way to ensure this is to read/write +two words from the same object. 
If we make operations on different objects, +they might or might not be to the same cache bank. + + + +REFERENCES + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000000..a61082a456 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,196 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl +dnl P5: 2.375 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 
8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000000..147b55610f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,99 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. +dnl +dnl P5: 14.0 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. */ + + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + M4_inst %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm new file mode 100644 index 0000000000..dbac81642f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/diveby3.asm @@ -0,0 +1,183 @@ +dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P5: 15.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + movl PARAM_SIZE, %edx + + decl %edx + jnz L(two_or_more) + + movl (%ecx), %edx + movl PARAM_CARRY, %eax C risk of cache bank clash here + + movl PARAM_DST, %ecx + subl %eax, %edx + + sbbl %eax, %eax C 0 or -1 + + imull $INVERSE_3, %edx, %edx + + negl %eax C 0 or 1 + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b*2/3) + movl %edx, (%ecx) + + ret + + +L(two_or_more): + C eax + C ebx + C ecx src + C edx size-1 + C esi + C edi + C ebp + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi 
FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_DST, %edi + movl PARAM_CARRY, %esi + + movl (%ecx), %eax C src low limb + xorl %ebx, %ebx + + sub %esi, %eax + movl $TWO_THIRDS_FLOOR, %esi + + leal (%ecx,%edx,4), %ecx C &src[size-1] + leal (%edi,%edx,4), %edi C &dst[size-1] + + adcl $0, %ebx C carry, 0 or 1 + negl %edx C -(size-1) + + +C The loop needs a source limb ready at the top, which leads to one limb +C handled separately at the end, and the special case above for size==1. +C There doesn't seem to be any scheduling that would keep the speed but move +C the source load and carry subtract up to the top. +C +C The destination cache line prefetching adds 1 cycle to the loop but is +C considered worthwhile. The slowdown is a factor of 1.07, but will prevent +C repeated write-throughs if the destination isn't in L1. A version using +C an outer loop to prefetch only every 8 limbs (a cache line) proved to be +C no faster, due to unavoidable branch mispredictions in the inner loop. +C +C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx +C could be avoided then the src limb fetch could pair up and save a cycle. +C This would probably mean going to a two limb loop with the carry limb +C alternately positive or negative, since an sbbl %ebx,%ebx will leave a +C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax. +C +C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as +C "cmpl %edx, $n" with the immediate as the second operand. +C +C The "4" source displacement is in the loop rather than the setup because +C this gets L(top) aligned to 8 bytes at no cost. 
+ + ALIGN(8) +L(top): + C eax source limb, carry subtracted + C ebx carry (0 or 1) + C ecx &src[size-1] + C edx counter, limbs, negative + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp scratch (result limb) + + imull $INVERSE_3, %eax, %ebp + + cmpl $ONE_THIRD_CEIL, %ebp + movl (%edi,%edx,4), %eax C dst cache line prefetch + + sbbl $-1, %ebx C +1 if ebp>=ceil(b/3) + cmpl %ebp, %esi + + movl 4(%ecx,%edx,4), %eax C next src limb + + sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3) + movl $0, %ebx + + adcl $0, %ebx C new carry + movl %ebp, (%edi,%edx,4) + + incl %edx + jnz L(top) + + + + imull $INVERSE_3, %eax, %edx + + cmpl $ONE_THIRD_CEIL, %edx + movl %edx, (%edi) + + sbbl $-1, %ebx C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3) + popl %ebp + + movl %ebx, %eax + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000000..d3ed3d73ce --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 179 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 153 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 46 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 110 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 3840 +#endif diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000000..e1e35d4c57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/lshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. 
+dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
+ cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) C jump if s_ptr + 1 >= res_ptr + leal (%esi,%ebp,4),%eax + cmpl %eax,%edi + jnc L(special) C jump if res_ptr >= s_ptr + size + +L(normal): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl( %cl, %ebx, %edx) + shldl( %cl, %eax, %ebx) + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl( %cl, %edx, %eax) + shldl( %cl, %ebx, %edx) + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl( %cl,%eax,%edx) + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shll %cl,%edx C compute least significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissible if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. 
+ +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000000..2379077d0c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 99 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 40 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000000..2225438658 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,455 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 
4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. 
+ C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 48(%ebx,%eax,4) + C mm3 dst qword ready to store to 56(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 48(%ebx,%eax,4) + C mm3 dst qword, for 56(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. 
+ C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. 
+ + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm new file mode 100644 index 0000000000..587a07ab3d --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000000..7672630d57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,460 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. +dnl +dnl P5: 1.75 cycles/limb. 
+ + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. +C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. 
+deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too. 
+ +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx return value + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately. 
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration. 
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned. 
+ C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. + + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written. 
+ + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000000..08639eca09 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,79 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. +dnl +dnl P5: 13.0 cycles/limb + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000000..d9f79a0831 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm @@ -0,0 +1,135 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. +dnl +dnl P5: 14.2 cycles/crossproduct (approx) + + +dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB.
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi 
+ popl %ebp + popl %esi + addl $4,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() + diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000000..e8f5ae8ec8 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/rshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. 
+ +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) C jump if res_ptr + 1 >= s_ptr + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc L(special) C jump if s_ptr >= res_ptr + size + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl( %cl, %ebx, %edx) + shrdl( %cl, %eax, %ebx) + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl( %cl, %edx, %eax) + shrdl( %cl, %ebx, %edx) + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl( %cl,%eax,%edx) C compute result limb + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shrl %cl,%edx C compute most significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from most significant end of the arrays, which
is only +C permissible if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. + +L(special): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax + rcrl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000000..c8584df13c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,520 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. +dnl +dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +dnl product at around 20x20 limbs.
+ + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 
20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2. 
+ C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C 
src[size-1]*src[size-2] + + movl %ecx, 0(%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. + +L(lshift): + C eax counter, negative + C ebx next limb + C ecx + C edx + C esi + C edi &dst[2*size-4] + C ebp + + movl 12(%edi,%eax,8), %ebx + + rcll %ebx + movl 16(%edi,%eax,8), %ecx + + rcll %ecx + movl %ebx, 12(%edi,%eax,8) + + movl %ecx, 16(%edi,%eax,8) + incl %eax + + jnz L(lshift) + + + adcl %eax, %eax C high bit out + movl PARAM_SRC, %esi + + movl PARAM_SIZE, %ecx C risk of cache bank clash + movl %eax, 12(%edi) C dst most significant limb + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + leal (%esi,%ecx,4), %esi C src end + + negl %ecx + + mull %eax + + movl %eax, 16(%edi,%ecx,8) C dst[0] + movl %edx, %ebx + + addl $1, %ecx C size-1 and clear carry + +L(diag): + C eax scratch (low product) + C ebx carry limb + C ecx counter, negative + C edx scratch (high product) + C esi &src[size] + C edi &dst[2*size-4] + C ebp scratch (fetched dst limbs) + + movl (%esi,%ecx,4), %eax + adcl $0, %ebx + + mull %eax + + movl 16-4(%edi,%ecx,8), %ebp + + addl %ebp, %ebx + movl 16(%edi,%ecx,8), %ebp + + adcl %eax, %ebp + movl %ebx, 16-4(%edi,%ecx,8) + + movl %ebp, 16(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(diag) + + + adcl $0, %edx + movl 16-4(%edi), %eax C dst most significant limb + + addl %eax, %edx + popl %ebp + + movl %edx, 16-4(%edi) + popl %esi C risk of cache bank clash + + popl %edi + popl %ebx + + ret + +EPILOGUE() |