1 files changed, 85 insertions, 0 deletions
diff --git a/mpn/x86_64/k10/hamdist.asm b/mpn/x86_64/k10/hamdist.asm
new file mode 100644
index 000000000..ae0309b3e
--- /dev/null
+++ b/mpn/x86_64/k10/hamdist.asm
@@ -0,0 +1,85 @@
+dnl  AMD64 mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C AMD K8,K9		 n/a
+C AMD K10		 2
+C Intel P4		 n/a
+C Intel core2		 n/a
+C Intel corei		 2.05
+C Intel atom		 n/a
+C VIA nano		 n/a
+
+C This is very straightforward 2-way unrolled code.
+
+C TODO
+C  * Write something less basic.  It should not be hard to reach 1.5 c/l with
+C    4-way unrolling.
+
+define(`ap',		`%rdi')
+define(`bp',		`%rsi')
+define(`n',		`%rdx')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	mov	(ap), %r8
+	xor	(bp), %r8
+
+	lea	(ap,n,8), ap			C point at A operand end
+	lea	(bp,n,8), bp			C point at B operand end
+	neg	n
+
+	bt	$0, R32(n)
+	jnc	L(2)
+
+L(1):	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	R32(%r10), R32(%r10)
+	add	$1, n
+	js	L(top)
+	ret
+
+	ALIGN(16)
+L(2):	mov	8(ap,n,8), %r9
+	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	8(bp,n,8), %r9
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n
+	js	L(top)
+	lea	(%r10, %rax), %rax
+	ret
+
+	ALIGN(16)
+L(top):	mov	(ap,n,8), %r8
+	lea	(%r10, %rax), %rax
+	mov	8(ap,n,8), %r9
+	xor	(bp,n,8), %r8
+	xor	8(bp,n,8), %r9
+	.byte	0xf3,0x49,0x0f,0xb8,0xc8	C popcnt %r8, %rcx
+	lea	(%rcx, %rax), %rax
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n
+	js	L(top)
+
+	lea	(%r10, %rax), %rax
+	ret
+EPILOGUE()