diff options
Diffstat (limited to 'rts/gmp/mpn/x86/mul_basecase.asm')
-rw-r--r-- | rts/gmp/mpn/x86/mul_basecase.asm | 209 |
1 files changed, 209 insertions, 0 deletions
diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm new file mode 100644 index 0000000000..3a9b73895b --- /dev/null +++ b/rts/gmp/mpn/x86/mul_basecase.asm @@ -0,0 +1,209 @@ +dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result +dnl in a third limb vector. + + +dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C This was written in a haste since the Pentium optimized code that was used +C for all x86 machines was slow for the Pentium II. This code would benefit +C from some cleanup. +C +C To shave off some percentage of the run-time, one should make 4 variants +C of the Louter loop, for the four different outcomes of un mod 4. That +C would avoid Loop0 altogether. Code expansion would be > 4-fold for that +C part of the function, but since it is not very large, that would be +C acceptable. +C +C The mul loop (at L(oopM)) might need some tweaking. It's current speed is +C unknown. + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_MULTIPLIER, -4) +defframe(VAR_COUNTER, -8) +deflit(VAR_STACK_SPACE, 8) + + .text + ALIGN(8) + +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + subl $VAR_STACK_SPACE,%esp + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',eval(VAR_STACK_SPACE+12)) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + + leal 4(%esi),%esi + leal 4(%edi),%edi + +L(oopM): + movl (%esi),%eax C load next limb at xp[j] + leal 4(%esi),%esi + mull (%ebp) + addl %ebx,%eax + movl %edx,%ebx + adcl $0,%ebx + movl %eax,(%edi) + leal 4(%edi),%edi + decl %ecx + jnz L(oopM) + + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl PARAM_YSIZE,%eax C ysize + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + movl PARAM_YP,%ebp C yp + addl $4,%ebp C make ebp point to next v limb + movl %ebp,PARAM_YP + movl (%ebp),%eax C copy y limb ... + movl %eax,VAR_MULTIPLIER C ... to stack slot + movl PARAM_XSIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull VAR_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + addl %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_XSIZE,%ecx + shrl $2,%ecx + jz L(endX) + + ALIGN(8) +L(oopX): + movl (%esi),%eax + mull VAR_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull VAR_MULTIPLIER + addl %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + addl %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oopX) + +L(endX): + movl %ebx,(%edi) + addl $4,%edi + + C we incremented wp and xp in the loop above; compensate + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +EPILOGUE() |