diff options
author:    Torbjorn Granlund <tg@gmplib.org>  2017-02-21 16:20:58 +0100
committer: Torbjorn Granlund <tg@gmplib.org>  2017-02-21 16:20:58 +0100
commit:    d8206c844ce3a7609928c347513f8058a902af69 (patch)
tree:      0856eb43e1d58a5572984b775459a255b240f546 /mpn/arm64
parent:    c45499aff87b10190280ab9122a6ab6f228dec78 (diff)
download:  gmp-d8206c844ce3a7609928c347513f8058a902af69.tar.gz
Rewrite ARM64 shifting.
Diffstat (limited to 'mpn/arm64')
-rw-r--r--  mpn/arm64/lshift.asm | 108
-rw-r--r--  mpn/arm64/rshift.asm | 121
2 files changed, 118 insertions, 111 deletions
diff --git a/mpn/arm64/lshift.asm b/mpn/arm64/lshift.asm index 72b74fb22..1bb5698b9 100644 --- a/mpn/arm64/lshift.asm +++ b/mpn/arm64/lshift.asm @@ -1,6 +1,6 @@ dnl ARM64 mpn_lshift. -dnl Copyright 2013, 2014 Free Software Foundation, Inc. +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C Cortex-A53 ? -C Cortex-A57 ? +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. changecom(blah) @@ -34,46 +41,45 @@ define(`rp', `x16') define(`tnc',`x8') +define(`PSHIFT', lsl) +define(`NSHIFT', lsr) + ASM_START() PROLOGUE(mpn_lshift) add rp, rp_arg, n, lsl #3 add up, up, n, lsl #3 sub tnc, xzr, cnt + lsr x18, n, #2 tbz n, #0, L(bx0) L(bx1): ldr x4, [up,#-8] tbnz n, #1, L(b11) -L(b01): lsr x0, x4, tnc - lsl x18, x4, cnt - sub n, n, #1 - cbnz n, L(gt1) - str x18, [rp,#-8] +L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + cbnz x18, L(gt1) + str x2, [rp,#-8] ret L(gt1): ldp x4, x5, [up,#-24] sub up, up, #8 add rp, rp, #16 b L(lo2) -L(b11): lsr x0, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-24] - add n, n, #1 - add up, up, #8 - add rp, rp, #32 - b L(lo0) +L(b11): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-24]! 
+ b L(lo3) L(bx0): ldp x4, x5, [up,#-16] tbz n, #1, L(b00) -L(b10): lsr x0, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x18, x4, cnt - sub n, n, #2 - cbnz n, L(gt2) +L(b10): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + cbnz x18, L(gt2) orr x10, x10, x13 - stp x18, x10, [rp,#-16] + stp x2, x10, [rp,#-16] ret L(gt2): ldp x4, x5, [up,#-32] orr x10, x10, x13 @@ -82,41 +88,39 @@ L(gt2): ldp x4, x5, [up,#-32] add rp, rp, #8 b L(lo2) -L(b00): lsr x0, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-32] +L(b00): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-32]! orr x10, x10, x13 - str x10, [rp,#-8] - add rp, rp, #24 + str x10, [rp,#-8]! b L(lo0) ALIGN(16) -L(top): ldp x4, x5, [up,#-48] - sub rp, rp, #32 C integrate with stp? - sub up, up, #32 C integrate with ldp? - orr x11, x11, x9 +L(top): ldp x4, x5, [up,#-16] orr x10, x10, x13 + orr x11, x12, x2 stp x10, x11, [rp,#-16] -L(lo2): lsr x11, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-32] - orr x11, x11, x18 - orr x10, x10, x13 - stp x10, x11, [rp,#-32] -L(lo0): sub n, n, #4 - lsr x11, x7, tnc - lsl x13, x7, cnt - lsr x10, x6, tnc - lsl x18, x6, cnt - cbnz n, L(top) - -L(end): orr x11, x11, x9 + PSHIFT x2, x6, cnt +L(lo2): NSHIFT x10, x4, tnc + PSHIFT x13, x5, cnt + NSHIFT x12, x5, tnc + ldp x6, x7, [up,#-32]! orr x10, x10, x13 - stp x10, x11, [rp,#-48] - str x18, [rp,#-56] + orr x11, x12, x2 + stp x10, x11, [rp,#-32]! 
+ PSHIFT x2, x4, cnt +L(lo0): sub x18, x18, #1 +L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc + cbnz x18, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x6, cnt + stp x10, x11, [rp,#-16] + str x2, [rp,#-24] ret EPILOGUE() diff --git a/mpn/arm64/rshift.asm b/mpn/arm64/rshift.asm index dac51ef09..97ddda741 100644 --- a/mpn/arm64/rshift.asm +++ b/mpn/arm64/rshift.asm @@ -1,6 +1,6 @@ dnl ARM64 mpn_rshift. -dnl Copyright 2013, 2014 Free Software Foundation, Inc. +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C Cortex-A53 ? -C Cortex-A57 ? +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. changecom(blah) @@ -34,88 +41,84 @@ define(`rp', `x16') define(`tnc',`x8') +define(`PSHIFT', lsr) +define(`NSHIFT', lsl) + ASM_START() PROLOGUE(mpn_rshift) mov rp, rp_arg sub tnc, xzr, cnt + lsr x18, n, #2 tbz n, #0, L(bx0) -L(bx1): ldr x4, [up,#0] +L(bx1): ldr x5, [up] tbnz n, #1, L(b11) -L(b01): lsl x0, x4, tnc - lsr x18, x4, cnt - sub n, n, #1 - cbnz n, L(gt1) - str x18, [rp,#0] +L(b01): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + cbnz x18, L(gt1) + str x2, [rp] ret -L(gt1): ldp x5, x4, [up,#8] +L(gt1): ldp x4, x5, [up,#8] sub up, up, #8 sub rp, rp, #32 b L(lo2) -L(b11): lsl x0, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#8] - add n, n, #1 - sub up, up, #24 - sub rp, rp, #48 - b L(lo0) +L(b11): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#8]! 
+ sub rp, rp, #16 + b L(lo3) -L(bx0): ldp x5, x4, [up,#0] +L(bx0): ldp x4, x5, [up] tbz n, #1, L(b00) -L(b10): lsl x0, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x18, x4, cnt - sub n, n, #2 - cbnz n, L(gt2) +L(b10): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + cbnz x18, L(gt2) orr x10, x10, x13 - stp x10, x18, [rp,#0] + stp x10, x2, [rp] ret -L(gt2): ldp x5, x4, [up,#16] +L(gt2): ldp x4, x5, [up,#16] orr x10, x10, x13 - str x10, [rp,#0] - sub rp, rp, #24 + str x10, [rp],#-24 b L(lo2) -L(b00): lsl x0, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#16] +L(b00): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#16]! orr x10, x10, x13 - str x10, [rp,#0] - sub up, up, #16 - sub rp, rp, #40 + str x10, [rp],#-8 b L(lo0) ALIGN(16) -L(top): ldp x5, x4, [up,#48] - add rp, rp, #32 C integrate with stp? - add up, up, #32 C integrate with ldp? - orr x11, x11, x9 +L(top): ldp x4, x5, [up,#16] orr x10, x10, x13 + orr x11, x12, x2 stp x11, x10, [rp,#16] -L(lo2): lsl x11, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#32] - orr x11, x11, x18 + PSHIFT x2, x7, cnt +L(lo2): NSHIFT x10, x5, tnc + NSHIFT x12, x4, tnc + PSHIFT x13, x4, cnt + ldp x6, x7, [up,#32]! orr x10, x10, x13 - stp x11, x10, [rp,#32] -L(lo0): sub n, n, #4 - lsl x11, x7, tnc - lsr x13, x7, cnt - lsl x10, x6, tnc - lsr x18, x6, cnt - cbnz n, L(top) - -L(end): orr x11, x11, x9 - orr x10, x10, x13 - stp x11, x10, [rp,#48] - str x18, [rp,#64] + orr x11, x12, x2 + stp x11, x10, [rp,#32]! + PSHIFT x2, x5, cnt +L(lo0): sub x18, x18, #1 +L(lo3): NSHIFT x10, x7, tnc + NSHIFT x12, x6, tnc + PSHIFT x13, x6, cnt + cbnz x18, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x7, cnt + stp x11, x10, [rp,#16] + str x2, [rp,#32] ret EPILOGUE() |