author	tege <tege@gmplib.org>	2006-12-17 00:15:58 +0100
committer	tege <tege@gmplib.org>	2006-12-17 00:15:58 +0100
commit	8607f387a73cc4a5e03d2bd433c8054175c544ac (patch)
tree	53ed51d8eeaa29979003006c8303f7ce1740efe8 /mpn/x86_64/lshsub_n.asm
parent	194ae8bb1dba4d4aa8e023013c72ac38077bd214 (diff)
download	gmp-8607f387a73cc4a5e03d2bd433c8054175c544ac.tar.gz
New file.
Diffstat (limited to 'mpn/x86_64/lshsub_n.asm')
-rw-r--r--	mpn/x86_64/lshsub_n.asm	153
1 file changed, 153 insertions, 0 deletions
diff --git a/mpn/x86_64/lshsub_n.asm b/mpn/x86_64/lshsub_n.asm
new file mode 100644
index 000000000..8efb90f0d
--- /dev/null
+++ b/mpn/x86_64/lshsub_n.asm
@@ -0,0 +1,153 @@
+dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write
+dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8: 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
+C P4: ?
+C P6-15: 4.35
+
+C This was written quickly and not optimized at all, but it runs very well on
+C K8.  Perhaps one could get under 3 c/l.  Ideas:
+C 1) Use indexing to save the 3 LEA
+C 2) Write reasonable feed-in code
+C 3) Be more clever about register usage
+C 4) Unroll more; CL negation and carry save/restore cost a lot now
+C 5) Reschedule
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+
+ TEXT
+ ALIGN(16)
+ASM_START()
+PROLOGUE(mpn_lshsub_n)
+
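+C Save the callee-saved registers (SysV AMD64 ABI) used as scratch below.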
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rbx
+
+ mov n, %rax
+ xor %ebx, %ebx C clear carry save register
+ mov %r8d, %ecx C shift count
+ xor %r15d, %r15d C limb carry
+
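+C Do the low n mod 4 limbs one at a time, so the main loop can be 4-way
+C unrolled.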
+ mov %eax, %r11d
+ and $3, %r11d
+ je .L4
+ sub $1, %r11d
+
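+C Feed-in loop: subtract with borrow, shift the difference left by cnt, and
+C OR in the high bits shifted out of the previous limb.  The borrow flag is
+C carried across iterations in %ebx (sbb/add) and the out-shifted bits in
+C %r15.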
+.Loopette:
+ add %ebx, %ebx C restore carry flag
+ mov 0(up), %r8
+ lea 8(up), up
+ sbb 0(vp), %r8
+ mov %r8, %r12
+ sbb %ebx, %ebx C save carry flag
+ shl %cl, %r8
+ or %r15, %r8
+ mov %r12, %r15
+ lea 8(vp), vp
+ neg %cl
+ shr %cl, %r15
+ neg %cl
+ mov %r8, 0(rp)
+ lea 8(rp), rp
+ sub $1, %r11d
+ jnc .Loopette
+
+.L4:
+ sub $4, %rax
+ jc .Lend
+
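+C Main loop, 4 limbs per iteration.  %cl is negated around the shr group so
+C that the same register provides both the cnt and 64-cnt shift counts.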
+ ALIGN(16)
+.Loop:
+ add %ebx, %ebx C restore carry flag
+
+ mov 0(up), %r8
+ mov 8(up), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+
+ lea 32(up), up
+
+ sbb 0(vp), %r8
+ mov %r8, %r12
+ sbb 8(vp), %r9
+ mov %r9, %r13
+ sbb 16(vp), %r10
+ mov %r10, %r14
+ sbb 24(vp), %r11
+
+ sbb %ebx, %ebx C save carry flag
+
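+C rp[i] = (diff[i] << cnt) | (diff[i-1] >> (64-cnt))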
+ shl %cl, %r8
+ shl %cl, %r9
+ shl %cl, %r10
+ or %r15, %r8
+ mov %r11, %r15
+ shl %cl, %r11
+
+ lea 32(vp), vp
+
+ neg %cl
+
+ shr %cl, %r12
+ shr %cl, %r13
+ shr %cl, %r14
+ shr %cl, %r15 C used next loop
+
+ or %r12, %r9
+ or %r13, %r10
+ or %r14, %r11
+
+ neg %cl
+
+ mov %r8, 0(rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+
+ lea 32(rp), rp
+
+ sub $4, %rax
+ jnc .Loop
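+C Return value: the final borrow shifted up by cnt, plus the bits shifted
+C out of the last difference limb.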
+.Lend:
+ neg %ebx
+ shl %cl, %rbx
+ adc %r15, %rbx
+ mov %rbx, %rax
+ pop %rbx
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ ret
+EPILOGUE()
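
For reference, here is a minimal C sketch of what this routine computes, under
the usual mpn conventions (64-bit limbs, n-limb operands, 1 <= cnt <= 63).  It
is reconstructed from the asm above, not GMP's own C code; the function name
ref_lshsub_n and the plain C types are made up for this illustration.

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* {rp,n} = 2^cnt * ({up,n} - {vp,n}) mod 2^(64*n); the return value is the
   bits shifted out at the top, with the final borrow weighted by 2^cnt.  */
mp_limb_t
ref_lshsub_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
              long n, unsigned cnt)
{
  mp_limb_t borrow = 0;   /* borrow out of the limb-wise subtraction */
  mp_limb_t high = 0;     /* bits shifted out of the previous limb   */
  for (long i = 0; i < n; i++)
    {
      mp_limb_t u = up[i], v = vp[i];
      mp_limb_t diff = u - v - borrow;
      borrow = (u < v) | ((u == v) & borrow);
      rp[i] = (diff << cnt) | high;   /* the shl/or in the loops above */
      high = diff >> (64 - cnt);      /* the shr with the negated count */
    }
  return (borrow << cnt) + high;      /* matches the .Lend computation */
}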