summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Torbjorn Granlund <tege@gmplib.org> 2010-12-13 20:08:08 +0100
committer Torbjorn Granlund <tege@gmplib.org> 2010-12-13 20:08:08 +0100
commit d3ac5c08d12176b94577ce4a8778a55cb2231f31 (patch)
tree 02eb5f26a80ac7a90e9af670466096e69a6980bd
parent a6753f28e4b4c30c98565b96fea2474c03e109b0 (diff)
download gmp-d3ac5c08d12176b94577ce4a8778a55cb2231f31.tar.gz
Add a k10 hamdist.asm.
-rw-r--r-- ChangeLog 2
-rw-r--r-- mpn/x86_64/k10/hamdist.asm 85
2 files changed, 87 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index b31a6c887..6fcb65b27 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
2010-12-13 Torbjorn Granlund <tege@gmplib.org>
+ * mpn/x86_64/k10/hamdist.asm: New file.
+
* configure.in: Amend last change for lame /bin/sh.
2010-12-12 Torbjorn Granlund <tege@gmplib.org>
diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm
new file mode 100644
index 000000000..ae0309b3e
--- /dev/null
+++ b/mpn/x86_64/k10/hamdist.asm
@@ -0,0 +1,85 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C AMD K8,K9        n/a
+C AMD K10          2
+C Intel P4         n/a
+C Intel core2      n/a
+C Intel corei      2.05
+C Intel atom       n/a
+C VIA nano         n/a
+
+C This is very straightforward 2-way unrolled code.
+
+C TODO
+C * Write something less basic. It should not be hard to reach 1.5 c/l with
+C 4-way unrolling.
+
+define(`ap', `%rdi') C pointer to the limbs of the A operand
+define(`bp', `%rsi') C pointer to the limbs of the B operand
+define(`n', `%rdx') C number of 64-bit limbs in each operand (indexed with scale 8)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist) C returns in %rax the number of bit positions where A and B differ
+ mov (ap), %r8 C r8 = A[0]; read before n is tested, so the code relies on n >= 1
+ xor (bp), %r8 C r8 = A[0] ^ B[0]
+
+ lea (ap,n,8), ap C point at A operand end
+ lea (bp,n,8), bp C point at B operand end
+ neg n C n = -count: (ap,n,8) addresses limb 0 and n counts up towards 0
+
+ bt $0, R32(n) C CF = low bit of n; negation keeps parity, so CF set means odd count
+ jnc L(2) C even count: start by consuming two limbs
+
+L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax (written as raw opcode bytes)
+ xor R32(%r10), R32(%r10) C odd count: no second limb yet, so the pending count is 0
+ add $1, n C one limb consumed
+ js L(top) C n still negative: an even number of limbs remains
+ ret C count was 1: rax already holds the result
+
+ ALIGN(16)
+L(2): mov 8(ap,n,8), %r9 C r9 = A[1]
+ .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
+ xor 8(bp,n,8), %r9 C r9 = A[1] ^ B[1]
+ .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
+ add $2, n C two limbs consumed
+ js L(top) C more limbs remain; r10 is added to rax at the top of the loop
+ lea (%r10, %rax), %rax C count was 2: result is the sum of both limb counts
+ ret
+
+ ALIGN(16)
+L(top): mov (ap,n,8), %r8 C on entry: rax = running total, r10 = count not yet added to it
+ lea (%r10, %rax), %rax C rax += pending count from the previous step
+ mov 8(ap,n,8), %r9 C r9 = second A limb of this pair
+ xor (bp,n,8), %r8 C r8 = differing bits in the first limb of this pair
+ xor 8(bp,n,8), %r9 C r9 = differing bits in the second limb of this pair
+ .byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx
+ lea (%rcx, %rax), %rax C rax += count for the first limb
+ .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
+ add $2, n C advance by two limbs; the sign flag clears once n reaches 0
+ js L(top) C loop while limbs remain
+
+ lea (%r10, %rax), %rax C add the last pending count to form the result
+ ret
+EPILOGUE()