1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
dnl AMD64 mpn_hamdist -- hamming distance.
dnl Copyright 2008, 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C AMD K8,K9 n/a
C AMD K10 2
C Intel P4 n/a
C Intel core2 n/a
C Intel corei 2.05
C Intel atom n/a
C VIA nano n/a
C This is very straightforward 2-way unrolled code.
C TODO
C * Write something less basic. It should not be hard to reach 1.5 c/l with
C 4-way unrolling.
C Symbolic register names for the three arguments. The code below indexes
C both operands as 8-byte limbs through (ap,n,8) and (bp,n,8).
C   ap = %rdi   base pointer of the first operand
C   bp = %rsi   base pointer of the second operand
C   n  = %rdx   limb count on entry; negated and used as an index later
define(`ap', `%rdi')
define(`bp', `%rsi')
define(`n', `%rdx')
C NOTE(review): ABI_SUPPORT is not defined in this file; presumably it
C declares the calling conventions this file may be built for -- confirm.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
C ASM_START, TEXT and ALIGN are not defined in this file either (presumably
C they come from the included config.m4). ALIGN(32) presumably places the
C function entry on a 32-byte boundary -- confirm against the macro.
ASM_START()
TEXT
ALIGN(32)
C ----------------------------------------------------------------------------
C mpn_hamdist -- count the bits that differ between two limb vectors.
C
C In:    ap (%rdi) = first operand, bp (%rsi) = second operand,
C        n (%rdx)  = number of 64-bit limbs in each operand.
C Out:   %rax = sum over all limbs of popcnt(ap[i] xor bp[i]).
C Uses:  %rax, %rcx, %r8, %r9, %r10 and flags as scratch; ap, bp and n are
C        overwritten.
C
C The first limb is loaded before n is examined and there is no test for
C n == 0, so the code presumably relies on callers passing n >= 1 -- confirm.
C
C Structure: both pointers are advanced to the end of their operands and n is
C negated, so (ap,n,8) walks forward while n counts up towards zero. An odd n
C consumes one limb in L(1), an even n consumes two limbs in L(2); the rest
C is handled two limbs per iteration in L(top).
C
C popcnt is emitted as raw bytes; the intended instruction is given in the
C comment next to each encoding (presumably so that assemblers without the
C popcnt mnemonic can still build this file -- confirm).
C ----------------------------------------------------------------------------
PROLOGUE(mpn_hamdist)
C NOTE(review): FUNC_ENTRY/FUNC_EXIT are defined outside this file; presumably
C they adapt the DOS64 calling convention to the register names used here.
FUNC_ENTRY(3)
C r8 = ap[0] xor bp[0], fetched before the pointers are moved.
mov (ap), %r8
xor (bp), %r8
lea (ap,n,8), ap C point at A operand end
lea (bp,n,8), bp C point at B operand end
C From here on n is a negative limb index relative to the operand ends.
neg n
C CF = low bit of n. The low bit is unchanged by negation, so CF is set
C exactly when the limb count is odd.
bt $0, R32(n)
jnc L(2)
C Odd count: account for the single first limb, then continue with pairs.
L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
C No second partial count on this path; clear r10 because L(top) adds it
C to rax on entry.
xor R32(%r10), R32(%r10)
add $1, n
C n still negative means more limbs remain.
js L(top)
FUNC_EXIT()
ret
ALIGN(16)
C Even count: account for the first two limbs. rax gets the count of limb 0
C and r10 the count of limb 1; they are summed either at the top of the loop
C or just before returning.
L(2): mov 8(ap,n,8), %r9
.byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
xor 8(bp,n,8), %r9
.byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
add $2, n
js L(top)
lea (%r10, %rax), %rax
FUNC_EXIT()
ret
ALIGN(16)
C Main loop, two limbs per iteration.
C On entry: rax = running total, r10 = count not yet added to rax,
C n < 0 = index of the next limb pair.
L(top): mov (ap,n,8), %r8
C Fold in the count left pending in r10 by the previous step.
lea (%r10, %rax), %rax
mov 8(ap,n,8), %r9
xor (bp,n,8), %r8
xor 8(bp,n,8), %r9
.byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx
lea (%rcx, %rax), %rax
C The count of the second limb stays in r10 until the next iteration or
C the final add below.
.byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
add $2, n
js L(top)
C Loop finished: add the last pending count to form the result in rax.
lea (%r10, %rax), %rax
FUNC_EXIT()
ret
EPILOGUE()
|