From d3ac5c08d12176b94577ce4a8778a55cb2231f31 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Mon, 13 Dec 2010 20:08:08 +0100 Subject: Add a k10 hamdist.asm. --- ChangeLog | 2 ++ mpn/x86_64/k10/hamdist.asm | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 mpn/x86_64/k10/hamdist.asm diff --git a/ChangeLog b/ChangeLog index b31a6c887..6fcb65b27 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2010-12-13 Torbjorn Granlund + * mpn/x86_64/k10/hamdist.asm: New file. + * configure.in: Amend last change for lame /bin/sh. 2010-12-12 Torbjorn Granlund diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm new file mode 100644 index 000000000..ae0309b3e --- /dev/null +++ b/mpn/x86_64/k10/hamdist.asm @@ -0,0 +1,85 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 2 +C Intel P4 n/a +C Intel core2 n/a +C Intel corei 2.05 +C Intel atom n/a +C VIA nano n/a + +C This is very straightforward 2-way unrolled code. + +C TODO +C * Write something less basic. It should not be hard to reach 1.5 c/l with +C 4-way unrolling. 
+
+define(`ap',		`%rdi')			C A operand pointer (%rdi; 1st integer argument under the System V AMD64 ABI)
+define(`bp',		`%rsi')			C B operand pointer (%rsi; 2nd integer argument)
+define(`n',		`%rdx')			C limb count (%rdx; 3rd); limb 0 is read unconditionally, so n >= 1 is assumed
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)				C returns in %rax the sum over all n limbs of popcnt(ap[i] ^ bp[i])
+	mov	(ap), %r8			C r8 = ap[0] ^ bp[0], fetched before the pointers are advanced
+	xor	(bp), %r8
+
+	lea	(ap,n,8), ap			C point at A operand end
+	lea	(bp,n,8), bp			C point at B operand end
+	neg	n				C n becomes a negative limb index that counts up towards zero
+
+	bt	$0, R32(n)			C CF = bit 0 of n (negating n did not change its parity)
+	jnc	L(2)				C even count: take limbs 0 and 1 before entering the loop
+
+L(1):	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	R32(%r10), R32(%r10)		C odd count: nothing pending in r10 for L(top) to fold in
+	add	$1, n				C limb 0 consumed
+	js	L(top)				C n still negative: an even number of limbs remains
+	ret					C n was 1; rax already holds the result
+
+	ALIGN(16)
+L(2):	mov	8(ap,n,8), %r9			C r9 = ap[1] ^ bp[1] (completed by the xor below)
+	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	8(bp,n,8), %r9
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n				C limbs 0 and 1 consumed
+	js	L(top)				C more limbs remain; r10 is folded in at L(top)
+	lea	(%r10, %rax), %rax		C n was 2: rax += r10
+	ret
+
+	ALIGN(16)
+L(top):	mov	(ap,n,8), %r8			C loop: two limbs per pass
+	lea	(%r10, %rax), %rax		C rax += count left pending in r10 by the previous step
+	mov	8(ap,n,8), %r9
+	xor	(bp,n,8), %r8			C r8 = differing bits of the first limb of the pair
+	xor	8(bp,n,8), %r9			C r9 = differing bits of the second limb of the pair
+	.byte	0xf3,0x49,0x0f,0xb8,0xc8	C popcnt %r8, %rcx
+	lea	(%rcx, %rax), %rax		C rax += rcx
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n
+	js	L(top)				C repeat while the index is still negative
+
+	lea	(%r10, %rax), %rax		C fold in the last pending count
+	ret
+EPILOGUE()
-- 
cgit v1.2.1