summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Torbjorn Granlund <tg@gmplib.org> 2017-06-03 23:57:45 +0200
committer Torbjorn Granlund <tg@gmplib.org> 2017-06-03 23:57:45 +0200
commit 46617131b276147b3e6b6531b3e76376f9504e7a (patch)
tree 588a263299690e0b77cdf36d271f93e9c0bfec4d
parent bea05b85192d3f18538ca8c51cbc2b8c60841d6a (diff)
download gmp-46617131b276147b3e6b6531b3e76376f9504e7a.tar.gz
Expand some instructions as .byte sequences.
-rw-r--r-- mpn/x86_64/bd1/hamdist.asm 22
-rw-r--r-- mpn/x86_64/bd1/popcount.asm 22
2 files changed, 22 insertions, 22 deletions
diff --git a/mpn/x86_64/bd1/hamdist.asm b/mpn/x86_64/bd1/hamdist.asm
index 67ee7b116..5282e7f1f 100644
--- a/mpn/x86_64/bd1/hamdist.asm
+++ b/mpn/x86_64/bd1/hamdist.asm
@@ -120,30 +120,30 @@ L(0): add $64, up
ALIGN(32)
L(top): lddqu (up), %xmm0
pxor (vp), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm5, %xmm0
pand %xmm5, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
L(6): lddqu 16(up), %xmm0
pxor 16(vp), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm5, %xmm0
pand %xmm5, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
L(4): lddqu 32(up), %xmm0
pxor 32(vp), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm5, %xmm0
pand %xmm5, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vphaddubq %xmm4, %xmm0 C sum to 8 x 16-bit counts
- vpperm %xmm1, %xmm7, %xmm7, %xmm4
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
paddb %xmm2, %xmm3
paddb %xmm2, %xmm4
paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts
@@ -166,7 +166,7 @@ L(2): mov 48(up), %r8
xor (vp), %r8
popcnt %r8, %r8
add %r8, %r10
-L(x): vphaddubq %xmm4, %xmm0 C sum to 8 x 16-bit counts
+L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
paddq %xmm0, %xmm8
pshufd $14, %xmm8, %xmm0
paddq %xmm8, %xmm0
diff --git a/mpn/x86_64/bd1/popcount.asm b/mpn/x86_64/bd1/popcount.asm
index 8fef42491..67bc9d7e9 100644
--- a/mpn/x86_64/bd1/popcount.asm
+++ b/mpn/x86_64/bd1/popcount.asm
@@ -130,28 +130,28 @@ L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
ALIGN(32)
L(top): lddqu (up), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm9, %xmm0
pand %xmm9, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
L(e6): lddqu 16(up), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm9, %xmm0
pand %xmm9, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
paddb %xmm2, %xmm3
paddb %xmm3, %xmm4
L(e4): lddqu 32(up), %xmm0
- vpshlb %xmm6, %xmm0, %xmm1
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
pand %xmm9, %xmm0
pand %xmm9, %xmm1
- vpperm %xmm0, %xmm7, %xmm7, %xmm2
- vphaddubq %xmm4, %xmm5 C sum to 8 x 16-bit counts
- vpperm %xmm1, %xmm7, %xmm7, %xmm4
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
paddb %xmm2, %xmm4
L(e2): popcnt 48(up), %r8
popcnt 56(up), %r9
@@ -162,7 +162,7 @@ L(e2): popcnt 48(up), %r8
sub $8, n
jg L(top)
- vphaddubq %xmm4, %xmm5 C sum to 8 x 16-bit counts
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
paddq %xmm5, %xmm8
pshufd $14, %xmm8, %xmm0
paddq %xmm8, %xmm0