dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C Intel P4	16.5
C Intel core2	 4.35
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.
C Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C Overview of the code below:
C   * A one-limb-per-iteration loop, L(oopette), handles the first n mod 4
C     limbs.  A 4-way unrolled loop, L(oop), handles the remaining limbs.
C   * Each limb is loaded from up, the matching limb at vp is subtracted from
C     it with borrow, and the difference is shifted left by the shift count.
C     The bits shifted out of one difference limb are OR-ed into the low end
C     of the next result limb, which is then stored at rp.
C   * The shift and loop counting instructions overwrite the carry flag, so
C     the borrow is parked in %ebx as 0 or -1 between subtract chains.
C
C Register usage:
C   %rax       limbs left for the unrolled loop, return value at exit
C   %cl        shift count, negated temporarily to get the right shift count
C   %ebx       saved borrow, 0 or -1
C   %r15       high bits of the previous difference limb, shifted down
C   %r11       loop counter in L(oopette), fourth data limb in L(oop)
C   %r8-%r10   data limbs
C   %r12-%r14  unshifted copies of difference limbs
C
C Return value in %rax: the bits shifted out of the top difference limb, plus
C the final borrow (0 or 1) shifted left by the shift count.
C
C NOTE(review): the negated-count trick and the final adc both assume
C 1 <= cnt <= 63 -- TODO confirm against the documented calling contract.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)

	C Save the registers that are used as temporaries and restored at exit.
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C copy n, since %rcx is reused for the count
	xor	%ebx, %ebx		C clear carry save register
	mov	%r8d, %ecx		C shift count
	xor	%r15d, %r15d		C limb carry

	mov	%eax, %r11d
	and	$3, %r11d		C r11d = n mod 4
	je	L(4)			C no odd limbs, go straight to the 4-way loop
	sub	$1, %r11d		C bias so that the loop exits on borrow

C One limb per iteration, executed n mod 4 times.
L(oopette):
	add	%ebx, %ebx		C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C lea leaves the carry flag intact
	sbb	0(vp), %r8		C r8 = u limb - v limb - borrow
	mov	%r8, %r12		C keep an unshifted copy of the difference
	sbb	%ebx, %ebx		C save carry flag
	shl	%cl, %r8
	or	%r15, %r8		C merge bits carried in from the previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	%cl			C low 6 bits of -cnt are the right shift count
	shr	%cl, %r15		C bits to carry into the next limb
	neg	%cl			C back to the left shift count
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, %r11d
	jnc	L(oopette)

L(4):
	sub	$4, %rax
	jc	L(end)			C fewer than 4 limbs left, nothing more to do

	ALIGN(16)
C Four limbs per iteration.
L(oop):
	add	%ebx, %ebx		C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up		C lea leaves the carry flag intact

	sbb	0(vp), %r8		C borrow chain across the four limbs
	mov	%r8, %r12		C mov leaves the carry flag intact
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	%ebx, %ebx		C save carry flag

	shl	%cl, %r8
	shl	%cl, %r9
	shl	%cl, %r10
	or	%r15, %r8		C merge bits carried in from the previous limb
	mov	%r11, %r15		C unshifted copy of the top difference limb
	shl	%cl, %r11

	lea	32(vp), vp

	neg	%cl			C low 6 bits of -cnt are the right shift count

	shr	%cl, %r12
	shr	%cl, %r13
	shr	%cl, %r14
	shr	%cl, %r15		C used next loop

	or	%r12, %r9		C each limb receives the high bits of the limb below
	or	%r13, %r10
	or	%r14, %r11

	neg	%cl			C back to the left shift count

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
	neg	%ebx			C saved borrow 0 or -1 becomes 0 or 1
	shl	%cl, %rbx		C borrow shifted left by the count
	adc	%r15, %rbx		C add the bits shifted out of the top limb.  The
					C carry flag left by the shl is the last bit
					C shifted out of a 0 or 1 value, which is zero
					C for counts 1..63, so this acts as a plain add.
	mov	%rbx, %rax		C return value
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	ret
EPILOGUE()