diff options
Diffstat (limited to 'rts/gmp/mpn/x86/p6')
-rw-r--r-- | rts/gmp/mpn/x86/p6/README | 95 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/aorsmul_1.asm | 300 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/diveby3.asm | 37 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/gmp-mparam.h | 96 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/mmx/divrem_1.asm | 677 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/mmx/mod_1.asm | 444 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/mmx/popham.asm | 31 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/p3mmx/popham.asm | 30 | ||||
-rw-r--r-- | rts/gmp/mpn/x86/p6/sqr_basecase.asm | 641 |
9 files changed, 2351 insertions, 0 deletions
diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README new file mode 100644 index 0000000000..7dbc905a0d --- /dev/null +++ b/rts/gmp/mpn/x86/p6/README @@ -0,0 +1,95 @@ + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. + + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 2.4 + + mpn_divrem_1 39.0 + mpn_mod_1 39.0 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + mpn_mod_1 24.0 + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. + +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too). +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000000..feb364ec0b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,300 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl P6: 6.35 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm new file mode 100644 index 0000000000..a77703ea89 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/diveby3.asm @@ -0,0 +1,37 @@ +dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P6: 8.5 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl The P5 code runs well on P6, in fact better than anything else found so +dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the +dnl dependent path are taking 4.5 cycles. +dnl +dnl The destination cache line prefetching is unnecessary on P6, but +dnl removing it is a 2 cycle slowdown (approx), so it must be inducing +dnl something good in the out of order execution. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_divexact_by3c) +include_mpn(`x86/pentium/diveby3.asm') diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000000..d7bfb6d60c --- /dev/null +++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,96 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 5 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 23 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 139 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 166 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 116 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 66 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 20 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 608 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 5888 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 672 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 5888 +#endif diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000000..f1b011b623 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,677 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. +dnl +dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's likely what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. + +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) + + movl -4(%esi,%ecx,4), %eax C src high limb + cmpl %ebp, %eax C one less div if high<divisor + jnb L(no_skip_div) + + movl $0, (%edi,%ecx,4) C dst high limb + decl %ecx C size-1 + movl %eax, %edx C src high limb as initial carry +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal 12(%edi), %ebx + + movl %ebx, VAR_DST_STOP + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ecx, %ebx C size + + bsrl %ebp, %ecx C 31-l + movl %edx, %edi C carry + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + movl $-1, %edx + + shll %cl, %ebp C d normalized + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + movl %eax, VAR_INVERSE + orl %ebx, %ebx C size + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl %eax, VAR_SRC + jz L(start_zero) + + movl 8(%eax), %esi C src high limb + cmpl $1, %ebx + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + jmp L(integer_one_left) + + +L(start_zero): + shll %cl, %edi C n2 = carry << l + movl $0, %esi C n10 = 0 + + C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. + + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). + +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. + + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm new file mode 100644 index 0000000000..e7d8d94d33 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm @@ -0,0 +1,444 @@ +dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder. +dnl +dnl P6MMX: 24.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here very similar to mpn_divrem_1, but with the quotient +C discarded. What's here probably isn't optimal. +C +C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some +C comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. + +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl $0, %edx C initial carry (if can't skip a div) + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + movl %ebx, SAVE_EBX + leal -4(%esi), %ebx + + movl %ebx, VAR_SRC_STOP + movl %ecx, %ebx C size + + movl %edi, SAVE_EDI + movl %edx, %edi C carry + + bsrl %ebp, %ecx C 31-l + movl $-1, %edx + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + shll %cl, %ebp C d normalized + + movd %eax, %mm7 + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. This +C gets it down to 24 c/l, which is still a bit away from a theoretical 19 +C c/l. + + ALIGN(16) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + movl VAR_SRC_STOP, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + cmpl %ebx, %ecx + + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%esi), %edx + movl SAVE_EBP, %ebp + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl VAR_SRC_STOP, %edx + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + cmpl %ecx, %edx + jne L(inverse_top) + + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000000..50f9a11218 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,31 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. +dnl +dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb +dnl (approx) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm new file mode 100644 index 0000000000..e63fbf334b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Haven't actually measured it, but the K7 code with the psadbw should be +dnl good on P-III. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm new file mode 100644 index 0000000000..174c78406a --- /dev/null +++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm @@ -0,0 +1,641 @@ +dnl Intel P6 mpn_sqr_basecase -- square an mpn number. +dnl +dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular +dnl product (measured on the speed difference between 20 and 40 limbs, +dnl which is the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for +dnl a description. The only difference here is that UNROLL_COUNT can go up +dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed so +C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 40x40 that do use the full +C unrolling will least be making good use of it, because 40x40 will take +C something like 7000 cycles. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src limb + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx + +defframe(SAVE_ESI, -4) +defframe(SAVE_EBX, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(`STACK_SPACE',16) + + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl %eax, %esi + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%esi), %eax + + movl %ebx, SAVE_EBX + movl %edx, %ebx C dst[1] + + mull %eax C src[1]^2 + + movl %edi, SAVE_EDI + movl %eax, %edi C dst[2] + movl (%esi), %eax + + movl %ebp, SAVE_EBP + movl %edx, %ebp C dst[3] + + mull 4(%esi) C src[0]*src[1] + + addl %eax, %ebx + movl SAVE_ESI, %esi + + adcl %edx, %edi + + adcl $0, %ebp + addl %ebx, %eax + movl SAVE_EBX, %ebx + + adcl %edi, %edx + movl SAVE_EDI, %edi + + adcl $0, %ebp + + movl %eax, 4(%ecx) + + movl %ebp, 12(%ecx) + movl SAVE_EBP, %ebp + + movl %edx, 8(%ecx) + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size +deflit(`FRAME',0) + + pushl %esi defframe_pushl(`SAVE_ESI') + cmpl $4, %edx + + movl PARAM_SRC, %esi + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + + C eax src low limb + C ebx + C ecx dst + C edx + C esi src + C edi + C ebp + + pushl %ebp defframe_pushl(`SAVE_EBP') + pushl %edi defframe_pushl(`SAVE_EDI') + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%esi), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + movl 8(%esi), %eax + + pushl %ebx defframe_pushl(`SAVE_EBX') + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%esi), %eax + + mull 4(%esi) C src[0] * src[1] + + movl %eax, %ebx + movl %edx, %edi + + movl (%esi), %eax + + mull 8(%esi) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%esi), %eax + + mull 8(%esi) C src[1] * src[2] + + xorl %esi, %esi + addl %eax, %ebp + + C eax + C ebx dst[1] + C ecx dst + C edx dst[4] + C esi zero, will be dst[5] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %ebx, %ebx + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %esi + addl %ebx, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %ebx + + adcl %ebp, %ebx + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl SAVE_EBP, %ebp + + movl %ebx, 12(%ecx) + movl SAVE_EBX, %ebx + + adcl %edx, %edi + movl 20(%ecx), %eax + + movl %edi, 16(%ecx) + movl SAVE_EDI, %edi + + adcl %esi, %eax C no carry out of this + movl SAVE_ESI, %esi + + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(`STACK_SPACE',24) + +L(four_or_more): + C eax src low limb + C ebx + C ecx + C edx size + C esi src + C edi + C ebp +deflit(`FRAME',4) dnl %esi already pushed + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + + subl $STACK_SPACE-FRAME, %esp +deflit(`FRAME',STACK_SPACE) + movl $1, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + subl %edx, %ecx C -(size-1) + + movl %ebp, SAVE_EBP + movl $0, %ebx C initial carry + + leal (%esi,%edx,4), %esi C &src[size] + movl %eax, %ebp C multiplier + + leal -4(%edi,%edx,4), %edi C &dst[size-1] + + +C This loop runs at just over 6 c/l. + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter, limbs, negative, -(size-1) to -1 + C edx scratch + C esi &src[size] + C edi &dst[size-1] + C ebp multiplier + + movl %ebp, %eax + + mull (%esi,%ecx,4) + + addl %ebx, %eax + movl $0, %ebx + + adcl %edx, %ebx + movl %eax, 4(%edi,%ecx,4) + + incl %ecx + jnz L(mul_1) + + + movl %ebx, 4(%edi) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. + +dnl This is also hard-coded in the address calculation below. +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With &src[size] and &dst[size-1] pointers, the displacements in the +dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above +dnl that an offset must be added to them. +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. + + subl $1, %ecx C size-1 + xorl %eax, %eax C ready for final adcl, and clear carry + + movl %ecx, %edx + movl PARAM_SRC, %esi + + +L(lshift): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx size-1 (for later use) + C esi src (for later use) + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + + leal 8(%edi), %edi + decl %ecx + jnz L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%esi), %eax C src[0] + + leal 4(%esi,%edx,4), %esi C &src[size] + subl %edx, %ecx C -(size-1) + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi &src[size] + C edi dst[2*size-2] + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() |