diff options
author | tege <tege@gmplib.org> | 2006-12-17 00:15:58 +0100 |
---|---|---|
committer | tege <tege@gmplib.org> | 2006-12-17 00:15:58 +0100 |
commit | 8607f387a73cc4a5e03d2bd433c8054175c544ac (patch) | |
tree | 53ed51d8eeaa29979003006c8303f7ce1740efe8 /mpn/x86_64/lshsub_n.asm | |
parent | 194ae8bb1dba4d4aa8e023013c72ac38077bd214 (diff) | |
download | gmp-8607f387a73cc4a5e03d2bd433c8054175c544ac.tar.gz |
New file.
Diffstat (limited to 'mpn/x86_64/lshsub_n.asm')
-rw-r--r-- | mpn/x86_64/lshsub_n.asm | 153 |
1 file changed, 153 insertions, 0 deletions
diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm new file mode 100644 index 000000000..8efb90f0d --- /dev/null +++ b/mpn/x86_64/lshsub_n.asm @@ -0,0 +1,153 @@ +dnl AMD64 mpn_lshsub_n. R = 2^k(U - V). + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write +dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +dnl Boston, MA 02110-1301, USA. + +include(`../config.m4') + + +C cycles/limb +C K8: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C P4: ? +C P6-15: 4.35 + +C This was written quickly and not optimized at all, but it runs very well on +C K8. But perhaps one could get under 3 c/l. 
Ideas: +C 1) Use indexing to save the 3 LEA +C 2) Write reasonable feed-in code +C 3) Be more clever about register usage +C 4) Unroll more, handling CL negation, carry save/restore cost much now +C 5) Reschedule + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt' `%r8') + + TEXT + ALIGN(16) +ASM_START() +PROLOGUE(mpn_lshsub_n) + + push %r12 + push %r13 + push %r14 + push %r15 + push %rbx + + mov n, %rax + xor %ebx, %ebx C clear carry save register + mov %r8d, %ecx C shift count + xor %r15d, %r15d C limb carry + + mov %eax, %r11d + and $3, %r11d + je .L4 + sub $1, %r11d + +.Loopette: + add %ebx, %ebx C restore carry flag + mov 0(up), %r8 + lea 8(up), up + sbb 0(vp), %r8 + mov %r8, %r12 + sbb %ebx, %ebx C save carry flag + shl %cl, %r8 + or %r15, %r8 + mov %r12, %r15 + lea 8(vp), vp + neg %cl + shr %cl, %r15 + neg %cl + mov %r8, 0(rp) + lea 8(rp), rp + sub $1, %r11d + jnc .Loopette + +.L4: + sub $4, %rax + jc .Lend + + ALIGN(16) +.Loop: + add %ebx, %ebx C restore carry flag + + mov 0(up), %r8 + mov 8(up), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + + lea 32(up), up + + sbb 0(vp), %r8 + mov %r8, %r12 + sbb 8(vp), %r9 + mov %r9, %r13 + sbb 16(vp), %r10 + mov %r10, %r14 + sbb 24(vp), %r11 + + sbb %ebx, %ebx C save carry flag + + shl %cl, %r8 + shl %cl, %r9 + shl %cl, %r10 + or %r15, %r8 + mov %r11, %r15 + shl %cl, %r11 + + lea 32(vp), vp + + neg %cl + + shr %cl, %r12 + shr %cl, %r13 + shr %cl, %r14 + shr %cl, %r15 C used next loop + + or %r12, %r9 + or %r13, %r10 + or %r14, %r11 + + neg %cl + + mov %r8, 0(rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + + lea 32(rp), rp + + sub $4, %rax + jnc .Loop +.Lend: + neg %ebx + shl %cl, %rbx + adc %r15, %rbx + mov %rbx, %rax + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + ret +EPILOGUE() |