summary | refs | log | tree | commit | diff
path: root/mpn/x86_64
diff options
context:
space:
mode:
author: Torbjorn Granlund <tg@gmplib.org> 2019-08-30 23:29:44 +0200
committer: Torbjorn Granlund <tg@gmplib.org> 2019-08-30 23:29:44 +0200
commit: 3b7029ca709031796ccbe00656582c3aa9a59fe8 (patch)
tree: a2785d5ae638937e9dda024af514abba02274abf /mpn/x86_64
parent: dded631ca1a61e83895564deed01961c38f8b6fb (diff)
download: gmp-3b7029ca709031796ccbe00656582c3aa9a59fe8.tar.gz
Optimise, now runs well on more CPUs.
Diffstat (limited to 'mpn/x86_64')
-rw-r--r-- mpn/x86_64/coreihwl/gcd_22.asm 39
1 file changed, 18 insertions, 21 deletions
diff --git a/mpn/x86_64/coreihwl/gcd_22.asm b/mpn/x86_64/coreihwl/gcd_22.asm
index 525235e44..b5863b60e 100644
--- a/mpn/x86_64/coreihwl/gcd_22.asm
+++ b/mpn/x86_64/coreihwl/gcd_22.asm
@@ -37,11 +37,11 @@ C AMD K10 -
C AMD bd1 -
C AMD bd2 -
C AMD bd3 -
-C AMD bd4 6.8
+C AMD bd4 6.7
C AMD bt1 -
C AMD bt2 -
-C AMD zn1 5.7
-C AMD zn2 5.9
+C AMD zn1 5.4
+C AMD zn2 5.5
C Intel P4 -
C Intel CNR -
C Intel PNR -
@@ -50,8 +50,8 @@ C Intel WSM -
C Intel SBR -
C Intel IBR -
C Intel HWL 7.1
-C Intel BWL 6.0
-C Intel SKL 6.3
+C Intel BWL 5.5
+C Intel SKL 5.6
C Intel atom -
C Intel SLM -
C Intel GLM -
@@ -62,15 +62,13 @@ C VIA nano -
define(`u1', `%rdi')
define(`u0', `%rsi')
define(`v1', `%rdx')
-define(`v0_param', `%rcx')
-
-define(`v0', `%rax')
-define(`cnt', `%rcx')
+define(`v0', `%rcx')
define(`s0', `%r8')
define(`s1', `%r9')
define(`t0', `%r10')
define(`t1', `%r11')
+define(`cnt', `%rax')
dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
ABI_SUPPORT(STD64)
@@ -79,8 +77,7 @@ ASM_START()
TEXT
ALIGN(64)
PROLOGUE(mpn_gcd_22)
- FUNC_ENTRY(2)
- mov v0_param, v0
+ FUNC_ENTRY(4)
ALIGN(16)
L(top): mov v0, t0
@@ -89,12 +86,11 @@ L(top): mov v0, t0
mov v1, t1
sbb u1, t1
- mov u0, s0
- mov u1, s1
-
rep;bsf t0, cnt C tzcnt!
+ mov u0, s0
sub v0, u0
+ mov u1, s1
sbb v1, u1
L(bck): cmovc t0, u0 C u = |u - v|
@@ -104,18 +100,19 @@ L(bck): cmovc t0, u0 C u = |u - v|
xor R32(t0), R32(t0)
sub cnt, t0
+ shlx( t0, u1, s1)
shrx( cnt, u0, u0)
- shlx( t0, u1, t0)
- or t0, u0
- shr R8(cnt), u1
- jnz L(top)
+ shrx( cnt, u1, u1)
+ or s1, u0
test v1, v1
jnz L(top)
+ test u1, u1
+ jnz L(top)
L(gcd_11):
mov v0, %rdi
- mov u0, %rsi
+C mov u0, %rsi
TCALL( mpn_gcd_11)
L(lowz):C We come here when v0 - u0 = 0
@@ -134,8 +131,8 @@ L(lowz):C We come here when v0 - u0 = 0
sub v1, u0
jmp L(bck)
-L(end): C mov v0, %rax
+L(end): mov v0, %rax
C mov v1, %rdx
- FUNC_EXIT()
+L(ret): FUNC_EXIT()
ret
EPILOGUE()