Check in intended version of last file.

author: Torbjorn Granlund <tege@gmplib.org> 2013-08-01 23:32:13 +0200
committer: Torbjorn Granlund <tege@gmplib.org> 2013-08-01 23:32:13 +0200
commit: fcd7e8f0371401677090d537ef48205978554aab (patch)
tree: a016e6e84b7be05d354ab6ea6b61b46ad091e564 /mpn/x86_64/bd1
parent: e1743a2fb601a999af9f17cef67de7b56112ed3c (diff)
download: gmp-fcd7e8f0371401677090d537ef48205978554aab.tar.gz
1 files changed, 57 insertions, 63 deletions
diff --git a/mpn/x86_64/bd1/mul_basecase.asm b/mpn/x86_64/bd1/mul_basecase.asm
index 54f695de9..014edab90 100644
--- a/mpn/x86_64/bd1/mul_basecase.asm
+++ b/mpn/x86_64/bd1/mul_basecase.asm
@@ -1,5 +1,7 @@
 dnl  AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
 
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
 dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
 
 dnl  This file is part of the GNU MP Library.
@@ -22,12 +24,12 @@ include(`../config.m4')
 C cycles/limb	mul_1		mul_2		mul_3		addmul_2
 C AMD K8,K9
 C AMD K10
-C AMD bull	~4.1		~4.55		-		~4.3
-C AMD pile	~4.5		~4.55		-		~4.55
+C AMD bull	~4.8		~4.55		-		~4.3
+C AMD pile	~4.6		~4.55		-		~4.55
 C AMD bobcat
 C AMD jaguar
 C Intel P4
-C Intel core2
+C Intel core
 C Intel NHM
 C Intel SBR
 C Intel IBR
@@ -40,10 +42,12 @@ C The inner loops of this code are the result of running a code generation and
 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
 
 C TODO
-C  * Merge bull-specific mul_1.
+C  * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
+C    Alternatively, we could tweak the present code (which was loopmixed for a
+C    different CPU).
 C  * Merge faster mul_2.  Current fastest mul_2 code is non-indexed, causing
 C    some structure headaches.
-C  * Micro-optimise to save on the constant and linear terms.
+C  * Further micro-optimise.
 
 C When playing with pointers, set this to $2 to fall back to conservative
 C indexing in wind-dowm code.
@@ -71,7 +75,6 @@ ASM_START()
 PROLOGUE(mpn_mul_basecase)
 	push	%rbx
 	push	%rbp
-	push	%r12
 	mov	un_param, un		C free up rdx
 	neg	un
 
@@ -83,98 +86,88 @@ PROLOGUE(mpn_mul_basecase)
 	mul	v0			C shared for mul_1 and mul_2
 
 	test	$1, R8(vn)
-	jz	do_mul_2
+	jz	L(do_mul_2)
 
-do_mul_1:
+L(do_mul_1):
 	test	$1, R8(un)
 	jnz	L(m1x1)
 
-L(m1x0):test	$2, R8(un)
+L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
+	mov	%rdx, w1
+	mov	8(up,un,8), %rax
+	test	$2, R8(un)
 	jnz	L(m110)
 
-L(m100):lea	(un), n			C un = 4, 8, 12, ...
-	mov	%rax, w1
-	mov	%rdx, w0
-	mov	8(up,un,8), %rax
+L(m100):lea	2(un), n		C un = 4, 8, 12, ...
 	jmp	L(m1l0)
 
-L(m1x1):test	$2, R8(un)
+L(m110):lea	(un), n			C un = 2, 6, 10, ...
+	jmp	L(m1l2)
+
+L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
+	mov	%rdx, w0
+	test	$2, R8(un)
 	jz	L(m111)
 
-L(m101):lea	1(un), n		C un = 1, 5, 9, ...
-	mov	%rax, (rp,un,8)
+L(m101):lea	3(un), n		C un = 1, 5, 9, ...
 	test	n, n
-	jns	L(n1)
-	mov	%rdx, w1
-	jmp	L(m1l1)
-	
-L(n1):	mov	%rdx, (rp)
-	pop	%r12
+	js	L(m1l1)
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
 	pop	%rbp
 	pop	%rbx
 	ret
 
-L(m111):lea	-1(un), n		C un = 3, 7, 11, ...
-	mov	%rax, w0
-	mov	%rdx, w2
+L(m111):lea	1(un), n		C un = 3, 7, 11, ...
 	mov	8(up,un,8), %rax
-	mul	v0
 	jmp	L(m1l3)
 
-	.byte	0x90,0x90,0x90,0x90,0x90,0x90,0x90
-L(m110):lea	2(un), n		C un = 2, 6, 10, ...
-	mov	%rax, w2
-	mov	%rdx, w0
-	mov	8(up,un,8), %rax
-	test	n, n
-	jns	L(m1ed)
-
 	ALIGN(16)
-L(m1tp):mul	v0
-	mov	%rdx, w1
+L(m1tp):mov	%rdx, w0
+	add	%rax, w1
+L(m1l1):mov	-16(up,n,8), %rax
+	adc	$0, w0
+	mul	v0
 	add	%rax, w0
+	mov	w1, -24(rp,n,8)
+	mov	-8(up,n,8), %rax
+	mov	%rdx, w1
 	adc	$0, w1
-	mov	w0, -8(rp,n,8)
-	mov	w2, -16(rp,n,8)
-L(m1l1):mov	(up,n,8), %rax
-	mul	v0
+L(m1l0):mul	v0
+	mov	w0, -16(rp,n,8)
 	add	%rax, w1
 	mov	%rdx, w0
-	mov	8(up,n,8), %rax
+	mov	(up,n,8), %rax
 	adc	$0, w0
-L(m1l0):mul	v0
+L(m1l3):mul	v0
+	mov	w1, -8(rp,n,8)
+	mov	%rdx, w1
 	add	%rax, w0
-	mov	%rdx, w2
-	mov	16(up,n,8), %rax
-	adc	$0, w2
-	mul	v0
-	mov	w1, (rp,n,8)
-L(m1l3):add	%rax, w2
-	mov	w0, 8(rp,n,8)
-	mov	%rdx, w0
-	adc	$0, w0
+	mov	8(up,n,8), %rax
+	adc	$0, w1
+L(m1l2):mul	v0
+	mov	w0, (rp,n,8)
 	add	$4, n
-L(m1l2):mov	-8(up,n,8), %rax
-	js	L(m1tp)
+	jnc	L(m1tp)
 
-L(m1ed):mul	v0
-	add	%rax, w0
+L(m1ed):add	%rax, w1
 	adc	$0, %rdx
-	mov	w2, I(-16(rp),-16(rp,n,8))
-	mov	w0, I(-8(rp),-8(rp,n,8))
-	mov	%rdx, I((rp),(rp,n,8))
+	mov	w1, I(-8(rp),-24(rp,n,8))
+	mov	%rdx, I((rp),-16(rp,n,8))
 
 	dec	R32(vn)
 	jz	L(ret2)
 
 	lea	8(vp), vp
 	lea	8(rp), rp
+	push	%r12
 	push	%r13
 	push	%r14
-	jmp	do_addmul
+	jmp	L(do_addmul)
 
-do_mul_2:
+L(do_mul_2):
 define(`v1',	`%r14')
+	push	%r12
 	push	%r13
 	push	%r14
 
@@ -248,7 +241,8 @@ L(m2ed):add	%rax, w2
 	lea	16(vp), vp
 	lea	16(rp), rp
 
-do_addmul:
+
+L(do_addmul):
 	push	%r15
 	push	vn			C save vn in new stack slot
 define(`vn',	`(%rsp)')
@@ -398,8 +392,8 @@ L(end):	mul	v0
 	pop	%r15
 L(ret5):pop	%r14
 	pop	%r13
-L(ret2):pop	%r12
-	pop	%rbp
+	pop	%r12
+L(ret2):pop	%rbp
 	pop	%rbx
 	ret
 EPILOGUE()
author	Torbjorn Granlund <tege@gmplib.org>	2013-08-01 23:32:13 +0200
committer	Torbjorn Granlund <tege@gmplib.org>	2013-08-01 23:32:13 +0200
commit	fcd7e8f0371401677090d537ef48205978554aab (patch)
tree	a016e6e84b7be05d354ab6ea6b61b46ad091e564 /mpn/x86_64/bd1
parent	e1743a2fb601a999af9f17cef67de7b56112ed3c (diff)
download	gmp-fcd7e8f0371401677090d537ef48205978554aab.tar.gz