author    Torbjorn Granlund <tg@gmplib.org>    2019-08-24 00:27:11 +0200
committer Torbjorn Granlund <tg@gmplib.org>    2019-08-24 00:27:11 +0200
commit    1336e81059ab28efa2ba7249ec6144a4c3449cc2 (patch)
tree      69626f59ec448fb9f4570d49adf4e4559f0c2650 /mpn/x86_64
parent    7cea544442e7e16089a20253d74a872dd06201f9 (diff)
download  gmp-1336e81059ab28efa2ba7249ec6144a4c3449cc2.tar.gz
Remove x86_64/bt1/gcd_22.asm, add improved version as default x86_64/gcd_22.asm.
Diffstat (limited to 'mpn/x86_64')
-rw-r--r--  mpn/x86_64/gcd_22.asm (renamed from mpn/x86_64/bt1/gcd_22.asm)  78
1 file changed, 36 insertions(+), 42 deletions(-)
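
For orientation before the diff, here is a minimal C sketch of the two-limb binary GCD idea this file implements: form u = |u - v|, keep v = min(u, v), strip the trailing zeros from u, and drop to a single-limb GCD once both high limbs are zero. The names (gcd_22_sketch, dlimb_t, ctz) and the use of __builtin_ctzll are illustrative assumptions, not GMP's internal API; the precondition that both operands are odd is also assumed here.

    #include <stdint.h>

    typedef uint64_t limb_t;

    /* Illustrative result pair; GMP's internal mpn_gcd_22 returns an
       mp_double_limb_t, but this struct exists only for the sketch. */
    typedef struct { limb_t lo, hi; } dlimb_t;

    /* Trailing-zero count of a nonzero limb (stand-in for the asm's
       ctz_table / tzcnt path). */
    static int ctz(limb_t x) { return __builtin_ctzll(x); }

    /* Binary GCD of two 2-limb operands u = (u1:u0), v = (v1:v0),
       both assumed odd. */
    static dlimb_t gcd_22_sketch(limb_t u1, limb_t u0, limb_t v1, limb_t v0)
    {
        for (;;) {
            limb_t t0 = u0 - v0;                 /* low limb of u - v  */
            limb_t t1 = u1 - v1 - (u0 < v0);     /* high limb of u - v */

            if ((t0 | t1) == 0) {                /* u == v: gcd found  */
                dlimb_t g = { v0, v1 };
                return g;
            }

            if (u1 > v1 || (u1 == v1 && u0 > v0)) {
                u1 = t1; u0 = t0;                /* u > v: u = u - v    */
            } else {
                v1 = u1; v0 = u0;                /* v = min(u, v)       */
                u0 = -t0;                        /* u = v - u = -(u - v) */
                u1 = ~t1 + (t0 == 0);
            }

            /* u is now even and nonzero: shift out its trailing zeros. */
            if (u0 == 0) {                       /* >= 64 trailing zeros */
                u0 = u1 >> ctz(u1);
                u1 = 0;
            } else {
                int c = ctz(u0);                 /* 1 <= c <= 63 here */
                u0 = (u0 >> c) | (u1 << (64 - c));
                u1 >>= c;
            }

            if ((u1 | v1) == 0) {
                /* Both operands fit in one limb: plain single-limb binary
                   GCD (the asm tail-calls mpn_gcd_11 here instead). */
                while (u0 != v0) {
                    if (u0 > v0) { u0 -= v0; u0 >>= ctz(u0); }
                    else         { v0 -= u0; v0 >>= ctz(v0); }
                }
                dlimb_t g = { u0, 0 };
                return g;
            }
        }
    }
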
diff --git a/mpn/x86_64/bt1/gcd_22.asm b/mpn/x86_64/gcd_22.asm
index 6dfa5b7b0..984cd1ce8 100644
--- a/mpn/x86_64/bt1/gcd_22.asm
+++ b/mpn/x86_64/gcd_22.asm
@@ -1,4 +1,6 @@
-dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, tzcnt, no shlx.
+dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, no tzcnt, no shlx.
+dnl We actually use tzcnt here, when table cannot count bits, as tzcnt always
+dnl works for our use, and helps a lot for certain CPUs.
dnl Copyright 2019 Free Software Foundation, Inc.
@@ -32,36 +34,36 @@ include(`../config.m4')
C cycles/bit
-C AMD K8,K9 ?
-C AMD K10 ?
-C AMD bd1 9.83
-C AMD bd2 7.81
+C AMD K8,K9 8.9
+C AMD K10 8.8
+C AMD bd1 9.7
+C AMD bd2 7.8
C AMD bd3 ?
-C AMD bd4 ?
-C AMD bt1 9.0
-C AMD bt2 9.2
-C AMD zn1 ?
-C AMD zn2 ?
+C AMD bd4 7.4
+C AMD bt1 9.2
+C AMD bt2 9.1
+C AMD zn1 7.5
+C AMD zn2 7.5
C Intel P4 ?
-C Intel CNR ?
-C Intel PNR ?
-C Intel NHM ?
-C Intel WSM ?
-C Intel SBR ?
+C Intel CNR 10.5
+C Intel PNR 10.5
+C Intel NHM 9.7
+C Intel WSM 9.7
+C Intel SBR 10.7
C Intel IBR ?
-C Intel HWL ?
-C Intel BWL ?
-C Intel SKL ?
-C Intel atom ?
-C Intel SLM ?
-C Intel GLM ?
-C Intel GLM+ ?
+C Intel HWL 9.5
+C Intel BWL 8.7
+C Intel SKL 8.6
+C Intel atom 18.9
+C Intel SLM 14.0
+C Intel GLM 9.8
+C Intel GLM+ 8.8
C VIA nano ?
C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-deflit(MAXSHIFT, 7)
+deflit(MAXSHIFT, 8)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
PROTECT(`ctz_table')
@@ -83,7 +85,7 @@ define(`cnt', `%rcx')
define(`s0', `%r8')
define(`s1', `%r9')
-define(`t0', `%r10')
+define(`t0', `%rcx')
define(`t1', `%r11')
dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
@@ -91,13 +93,12 @@ ABI_SUPPORT(STD64)
ASM_START()
TEXT
- ALIGN(16)
+ ALIGN(64)
PROLOGUE(mpn_gcd_22)
FUNC_ENTRY(2)
- push %r12
mov v0_param, v0
- LEA( ctz_table, %r12)
+ LEA( ctz_table, %r10)
ALIGN(16)
L(top): mov v0, t0
@@ -113,31 +114,29 @@ L(top): mov v0, t0
sbb v1, u1
L(bck): cmovc t0, u0 C u = |u - v|
- cmovnc u1, t1 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
cmovc s0, v0 C v = min(u,v)
cmovc s1, v1 C v = min(u,v)
and $MASK, R32(t0)
- movzbl (%r12,t0), R32(%rcx)
+ movzbl (%r10,t0), R32(cnt)
jz L(count_better)
-C Rightshift (t1,,u0) into (u1,,u0)
+C Rightshift (u1,,u0) into (u1,,u0)
L(shr): shr R8(cnt), u0
- mov t1, u1
+ mov u1, t1
shr R8(cnt), u1
neg cnt
shl R8(cnt), t1
or t1, u0
- test u1, u1
- jnz L(top)
test v1, v1
jnz L(top)
+ test u1, u1
+ jnz L(top)
L(gcd_11):
mov v0, %rdi
mov u0, %rsi
- xor R32(%rdx), R32(%rdx)
- pop %r12
TCALL( mpn_gcd_11)
L(count_better):
@@ -150,22 +149,17 @@ L(lowz):C We come here when v0 - u0 = 0
mov v1, t0
sub u1, t0
je L(end)
- mov $0, t1
+ xor t1, t1
mov u0, s0
mov u1, s1
-
- rep;bsf t0, cnt C tzcnt!
-
mov u1, u0
+ xor u1, u1
sub v1, u0
- mov $0, u1
-
jmp L(bck)
L(end): C mov v0, %rax
C mov v1, %rdx
- pop %r12
FUNC_EXIT()
ret
EPILOGUE()
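
The MAXSHIFT bump from 7 to 8 means the byte table now resolves any difference whose low 8 bits are not all zero; only when the `and $MASK` leaves zero does the code take the L(count_better) path and count trailing zeros on the full limb. A rough C model of that split, using the hypothetical names build_ctz_table and ctz_limb and __builtin_ctzll as a stand-in for tzcnt (none of these are GMP's actual symbols):

    #include <stdint.h>

    enum { MAXSHIFT = 8, MASK = (1 << MAXSHIFT) - 1 };

    /* ctz_table[n] = number of trailing zeros of n for 0 < n <= MASK;
       ctz_table[0] = MAXSHIFT, meaning "low MAXSHIFT bits all zero,
       fall back to a real count". */
    static unsigned char ctz_table[MASK + 1];

    static void build_ctz_table(void)
    {
        ctz_table[0] = MAXSHIFT;
        for (int n = 1; n <= MASK; n++) {
            int c = 0;
            while (((n >> c) & 1) == 0)
                c++;
            ctz_table[n] = (unsigned char)c;
        }
    }

    /* Hot-loop lookup: mask off the low byte and use the table; only when
       that byte is zero count on the full 64-bit value (the tzcnt path). */
    static unsigned ctz_limb(uint64_t t0)
    {
        unsigned low = (unsigned)(t0 & MASK);
        if (low != 0)
            return ctz_table[low];
        return (unsigned)__builtin_ctzll(t0);  /* slow path; t0 != 0 here */
    }

In the GCD loop the difference u - v is always even and nonzero when this lookup runs, so the fallback never sees a zero operand, which is why tzcnt "always works for our use" even on CPUs that decode it as bsf.
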