author     Kevin Ryde <user42@zip.com.au>    2001-04-25 02:46:23 +0200
committer  Kevin Ryde <user42@zip.com.au>    2001-04-25 02:46:23 +0200
commit     f47189694ffbe459f0d2b3b689d77906b578146c (patch)
tree       c35f39df72f2c19147ccc41931e518ba559303e3
parent     3d361eec7ff17ad54f1bcf9a3734e13e74cce1b1 (diff)
download   gmp-f47189694ffbe459f0d2b3b689d77906b578146c.tar.gz
* mpn/x86/k6/mmx/dive_1.asm: New file.
-rw-r--r--   mpn/x86/k6/mmx/dive_1.asm   290
1 file changed, 290 insertions, 0 deletions
diff --git a/mpn/x86/k6/mmx/dive_1.asm b/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 000000000..bc8cd750d
--- /dev/null
+++ b/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,290 @@
+dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+dnl
+dnl                  divisor
+dnl                 odd   even
+dnl        K6:     10.0   12.0  cycles/limb
+dnl        K6-2:   10.0   11.5
+
+dnl  Copyright 2000, 2001 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 2.1 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public
+dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
+dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl  Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C A simple divl is used for size==1.  This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	xorl	%edx, %edx
+
+	cmpl	$1, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	PARAM_DST, %ecx
+	movl	%eax, (%ecx)
+
+	ret
+
+
+L(two_or_more):
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+L(strip_twos):
+	shrl	%eax
+	incl	%edx			C will get shift+1
+
+	jnc	L(strip_twos)
+	pushl	%esi		FRAME_pushl()
+
+	leal	1(%eax,%eax), %esi	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	call	L(movl_eip_ebp)
+
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebp
+C
+	movl	modlimb_invert_table@GOT(%ebp), %ebp
+C
+Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
+',`
+
+dnl non-PIC
+	movzbl	modlimb_invert_table(%eax), %eax	C inv 8 bits
+')
+	pushl	%edi		FRAME_pushl()
+
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_DST, %edi
+
+	imull	%esi, %eax		C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%esi, PARAM_DIVISOR	C d without twos
+	leal	(%ebx,%ecx,4), %ebx	C src end
+
+	imull	%esi, %ebp		C inv*inv*d
+
+	leal	(%edi,%ecx,4), %edi	C dst end
+	negl	%ecx			C -size
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	subl	$1, %edx		C shift amount, and clear carry
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	jnz	L(even)
+
+	movl	(%ebx,%ecx,4), %esi	C src low limb
+	jmp	L(odd_entry)
+
+
+	ALIGN(16)
+	nop	C code alignment
+L(odd_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	imull	%edx, %esi
+
+	movl	PARAM_DIVISOR, %eax
+	movl	%esi, -4(%edi,%ecx,4)
+
+	mull	%esi			C carry limb in edx
+
+	subl	%ebp, %edx		C apply carry bit
+	movl	(%ebx,%ecx,4), %esi
+
+L(odd_entry):
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(odd_top)
+
+
+	imull	%edx, %esi
+
+	movl	%esi, -4(%edi,%ecx,4)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+ifdef(`PIC',`
+L(movl_eip_ebp):
+	movl	(%esp), %ebp
+	ret
+
+	ALIGN(8)
+	nop	C code alignment, necessary for claimed speed
+	nop
+',`
+C non-PIC code alignment already ok at 0x9a
+')
+
+L(even):
+	C eax
+	C ebx	src end
+	C ecx	-size
+	C edx	twos
+	C esi
+	C edi	dst end
+	C ebp
+
+	xorl	%ebp, %ebp
+Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]
+
+	movd	%edx, %mm7
+	movl	VAR_INVERSE, %edx
+
+	addl	$2, %ecx
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	jz	L(even_two)		C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
+C been found.  Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+	C
+	C mm0	scratch, source limbs
+	C mm7	twos
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi,%ecx,4)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movq	-4(%ebx,%ecx,4), %mm0
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(even_top)
+
+
+L(even_two):
+	movd	-4(%ebx), %mm0		C src high limb
+	psrlq	%mm7, %mm0
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	movl	VAR_INVERSE, %eax
+	subl	%edx, %esi		C apply carry limb
+
+	imull	%eax, %esi
+
+	movl	%esi, -4(%edi)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	emms_or_femms
+
+	ret
+
+EPILOGUE()
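
For readers following the algorithm rather than the scheduling: the setup
between L(strip_twos) and the ASSERT computes inv with d*inv == 1 mod 2^32,
starting from an 8-bit seed out of modlimb_invert_table and then applying
two Newton steps of inv = 2*inv - inv*inv*d, each of which doubles the
number of correct low bits.  A minimal C sketch of the same iteration,
assuming 32-bit limbs; binv32 is an illustrative name, not GMP API, and it
seeds from d itself (correct to 3 bits) instead of the table, so it needs
four steps rather than two.

#include <stdint.h>

/* Multiplicative inverse of odd d modulo 2^32.  Each step is the same
   update the asm does with leal/imull/subl, and doubles the number of
   correct low bits.  d itself is correct to 3 bits, since d*d == 1 mod 8
   for any odd d.  */
static uint32_t
binv32 (uint32_t d)
{
  uint32_t inv = d;             /* 3 bits */
  inv = 2*inv - d*inv*inv;      /* 6 bits */
  inv = 2*inv - d*inv*inv;      /* 12 bits */
  inv = 2*inv - d*inv*inv;      /* 24 bits */
  inv = 2*inv - d*inv*inv;      /* 48 bits, >= 32 */
  return inv;
}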
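
The loops then run the usual exact-division recurrence: subtract the
incoming carry from the next source limb (the subl/sbbl pair), multiply by
the inverse to get a quotient limb (imull), and take the high half of
quotient times divisor as the next carry (mull).  An even divisor has its
twos stripped first, with the source limbs shifted right by the same amount
on the fly (the psrlq work in L(even_top)).  A sketch of both cases
combined, again assuming 32-bit limbs; divexact_1_sketch is a hypothetical
name, not mpn_divexact_1 itself.

#include <stddef.h>

/* dst[] = src[]/divisor for n >= 1 limbs, where src is known to be an
   exact multiple of divisor.  Uses binv32 from the sketch above.  */
static void
divexact_1_sketch (uint32_t *dst, const uint32_t *src,
                   size_t n, uint32_t divisor)
{
  /* Strip twos from an even divisor; shifting the source right by the
     same amount leaves the quotient unchanged.  */
  int twos = 0;
  while ((divisor & 1) == 0)
    {
      divisor >>= 1;
      twos++;
    }

  uint32_t inv = binv32 (divisor);
  uint32_t k = 0;                       /* carry limb plus carry bit */
  for (size_t i = 0; i < n; i++)
    {
      uint32_t next = (i + 1 < n ? src[i+1] : 0);
      /* source limb, shifted right across the limb boundary */
      uint32_t s = (uint32_t) ((((uint64_t) next << 32) | src[i]) >> twos);
      uint32_t t = s - k;               /* apply carry */
      uint32_t q = t * inv;             /* exact quotient limb */
      dst[i] = q;
      /* next carry: high half of q*divisor, plus the borrow bit */
      k = (uint32_t) (((uint64_t) q * divisor) >> 32) + (t > s);
    }
}

For instance, on src = {0xFFFFFFFD, 2} (the two-limb value 3*0xFFFFFFFF)
with divisor 3, the first iteration gives q = 0xFFFFFFFF and carry k = 2,
and the second gives 0, i.e. the quotient 0xFFFFFFFF.  The size==1 divl
path in the asm exists because this inverse setup costs more than a single
plain division; per the header comments, divl is about 10 cycles faster
there for an odd divisor and 20 for an even one.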