diff options
Diffstat (limited to 'mpn/x86_64/k10')
-rw-r--r-- | mpn/x86_64/k10/hamdist.asm | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm new file mode 100644 index 000000000..ae0309b3e --- /dev/null +++ b/mpn/x86_64/k10/hamdist.asm @@ -0,0 +1,85 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 2 +C Intel P4 n/a +C Intel core2 n/a +C Intel corei 2.05 +C Intel atom n/a +C VIA nano n/a + +C This is very straightforward 2-way unrolled code. + +C TODO +C * Write something less basic. It should not be hard to reach 1.5 c/l with +C 4-way unrolling. 
+
+C mpn_hamdist: count the bit positions in which two limb vectors differ.
+C
+C Corresponding limbs of the two vectors are XORed, the set bits of each
+C XOR result are counted with popcnt, and the counts are summed.
+C
+C Operands, as bound by the defines below (these are the first three integer
+C argument registers of the System V AMD64 convention):
+C   ap  (%rdi)  base address of the first vector
+C   bp  (%rsi)  base address of the second vector
+C   n   (%rdx)  number of limbs in each vector; limbs are 8 bytes wide
+C Result:  the total count is left in %rax at every ret.
+C Written: %rax, %rcx, %r8, %r9, %r10, plus the three operand registers and
+C          the flags.  The stack is not touched.
+C
+C NOTE(review): the first limb of each vector is loaded before the count is
+C examined, so a count of zero is not handled here; presumably callers
+C guarantee at least one limb -- confirm.
+define(`ap',		`%rdi')
+define(`bp',		`%rsi')
+define(`n',		`%rdx')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	mov	(ap), %r8			C r8 = first limb of A
+	xor	(bp), %r8			C r8 = differing bits of limb 0
+
+C Turn the bases into end pointers and the count into a negative index, so
+C that (ptr,index,8) walks forward and the sign flag ends the loop.
+	lea	(ap,n,8), ap			C point at A operand end
+	lea	(bp,n,8), bp			C point at B operand end
+	neg	n				C index = -count, rises toward zero
+
+C Negation keeps bit 0, so CF after bt is set exactly when the count is odd.
+C R32 comes from outside this file; presumably it names the 32-bit form of
+C its register argument -- confirm.
+	bt	$0, R32(n)
+	jnc	L(2)				C even count: start with two limbs
+
+C Odd count: only limb 0 is consumed before entering the 2-way loop.
+C popcnt is emitted as raw opcode bytes throughout, presumably so the file
+C builds with assemblers lacking the mnemonic -- confirm.
+L(1):	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	R32(%r10), R32(%r10)		C no pending count for the loop to add
+	add	$1, n				C one limb done
+	js	L(top)				C index still negative: more limbs
+	ret					C single limb: rax already holds the result
+
+C Even count: limbs 0 and 1 are consumed before entering the loop.
+	ALIGN(16)
+L(2):	mov	8(ap,n,8), %r9			C r9 = limb 1 of A
+	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	8(bp,n,8), %r9			C r9 = differing bits of limb 1
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n				C two limbs done
+	js	L(top)				C index still negative: more limbs
+	lea	(%r10, %rax), %rax		C fold in the pending count
+	ret
+
+C Main loop, two limbs per iteration.  On entry rax holds the running total
+C and r10 a count not yet added to it; that pending count is folded in at the
+C top of the next iteration, or after the loop exits.  The additions use lea,
+C which leaves the flags alone.
+	ALIGN(16)
+L(top):	mov	(ap,n,8), %r8			C r8 = next limb of A
+	lea	(%r10, %rax), %rax		C add the pending count from before
+	mov	8(ap,n,8), %r9			C r9 = the limb after it
+	xor	(bp,n,8), %r8			C differing bits, first limb of pair
+	xor	8(bp,n,8), %r9			C differing bits, second limb of pair
+	.byte	0xf3,0x49,0x0f,0xb8,0xc8	C popcnt %r8, %rcx
+	lea	(%rcx, %rax), %rax		C first count is added right away
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n				C two limbs done; sets SF for the branch
+	js	L(top)				C repeat while the index is negative
+
+	lea	(%r10, %rax), %rax		C add the last pending count
+	ret
+EPILOGUE() |