author     Torbjorn Granlund <tg@gmplib.org>   2017-02-21 16:20:58 +0100
committer  Torbjorn Granlund <tg@gmplib.org>   2017-02-21 16:20:58 +0100
commit     d8206c844ce3a7609928c347513f8058a902af69 (patch)
tree       0856eb43e1d58a5572984b775459a255b240f546 /mpn/arm64
parent     c45499aff87b10190280ab9122a6ab6f228dec78 (diff)
download   gmp-d8206c844ce3a7609928c347513f8058a902af69.tar.gz
Rewrite ARM64 shifting.
Diffstat (limited to 'mpn/arm64')
-rw-r--r--  mpn/arm64/lshift.asm  108
-rw-r--r--  mpn/arm64/rshift.asm  121
2 files changed, 118 insertions, 111 deletions
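
For orientation before the diff: mpn_lshift shifts the n-limb operand {up,n} left by cnt bits (1 <= cnt <= 63), stores the low n result limbs at rp, and returns the bits shifted out of the top limb. Below is a minimal C sketch of that contract only, assuming 64-bit limbs; the name ref_lshift is ours, and this is an illustration, not GMP's implementation (the assembly under review is the real one).

    #include <stdint.h>
    #include <stddef.h>

    /* Reference sketch of mpn_lshift's contract (illustration only). */
    uint64_t ref_lshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;            /* the asm's tnc = -cnt (mod 64) */
        uint64_t retval = up[n - 1] >> tnc; /* bits shifted out the top */
        for (size_t i = n - 1; i > 0; i--)  /* high-to-low, as the asm walks */
            rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
        rp[0] = up[0] << cnt;
        return retval;
    }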
diff --git a/mpn/arm64/lshift.asm b/mpn/arm64/lshift.asm
index 72b74fb22..1bb5698b9 100644
--- a/mpn/arm64/lshift.asm
+++ b/mpn/arm64/lshift.asm
@@ -1,6 +1,6 @@
dnl ARM64 mpn_lshift.
-dnl Copyright 2013, 2014 Free Software Foundation, Inc.
+dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
+C               cycles/limb    assumed optimal c/l
+C Cortex-A53    3.5-4.0        3.25
+C Cortex-A57    2.0            2.0
+C X-Gene        2.67           2.5
+
+C TODO
+C * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes. These
+C numbers should be 1 and 0, respectively. The str in wind-down should also
+C go.
+C * Using extr, and with 63 separate loops, we might reach 1.25 c/l on A57.
changecom(blah)
@@ -34,46 +41,45 @@ define(`rp', `x16')
define(`tnc',`x8')
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+
ASM_START()
PROLOGUE(mpn_lshift)
add rp, rp_arg, n, lsl #3
add up, up, n, lsl #3
sub tnc, xzr, cnt
+ lsr x18, n, #2
tbz n, #0, L(bx0)
L(bx1): ldr x4, [up,#-8]
tbnz n, #1, L(b11)
-L(b01): lsr x0, x4, tnc
- lsl x18, x4, cnt
- sub n, n, #1
- cbnz n, L(gt1)
- str x18, [rp,#-8]
+L(b01): NSHIFT x0, x4, tnc
+ PSHIFT x2, x4, cnt
+ cbnz x18, L(gt1)
+ str x2, [rp,#-8]
ret
L(gt1): ldp x4, x5, [up,#-24]
sub up, up, #8
add rp, rp, #16
b L(lo2)
-L(b11): lsr x0, x4, tnc
- lsl x9, x4, cnt
- ldp x6, x7, [up,#-24]
- add n, n, #1
- add up, up, #8
- add rp, rp, #32
- b L(lo0)
+L(b11): NSHIFT x0, x4, tnc
+ PSHIFT x2, x4, cnt
+ ldp x6, x7, [up,#-24]!
+ b L(lo3)
L(bx0): ldp x4, x5, [up,#-16]
tbz n, #1, L(b00)
-L(b10): lsr x0, x5, tnc
- lsl x13, x5, cnt
- lsr x10, x4, tnc
- lsl x18, x4, cnt
- sub n, n, #2
- cbnz n, L(gt2)
+L(b10): NSHIFT x0, x5, tnc
+ PSHIFT x13, x5, cnt
+ NSHIFT x10, x4, tnc
+ PSHIFT x2, x4, cnt
+ cbnz x18, L(gt2)
orr x10, x10, x13
- stp x18, x10, [rp,#-16]
+ stp x2, x10, [rp,#-16]
ret
L(gt2): ldp x4, x5, [up,#-32]
orr x10, x10, x13
@@ -82,41 +88,39 @@ L(gt2): ldp x4, x5, [up,#-32]
add rp, rp, #8
b L(lo2)
-L(b00): lsr x0, x5, tnc
- lsl x13, x5, cnt
- lsr x10, x4, tnc
- lsl x9, x4, cnt
- ldp x6, x7, [up,#-32]
+L(b00): NSHIFT x0, x5, tnc
+ PSHIFT x13, x5, cnt
+ NSHIFT x10, x4, tnc
+ PSHIFT x2, x4, cnt
+ ldp x6, x7, [up,#-32]!
orr x10, x10, x13
- str x10, [rp,#-8]
- add rp, rp, #24
+ str x10, [rp,#-8]!
b L(lo0)
ALIGN(16)
-L(top): ldp x4, x5, [up,#-48]
- sub rp, rp, #32 C integrate with stp?
- sub up, up, #32 C integrate with ldp?
- orr x11, x11, x9
+L(top): ldp x4, x5, [up,#-16]
orr x10, x10, x13
+ orr x11, x12, x2
stp x10, x11, [rp,#-16]
-L(lo2): lsr x11, x5, tnc
- lsl x13, x5, cnt
- lsr x10, x4, tnc
- lsl x9, x4, cnt
- ldp x6, x7, [up,#-32]
- orr x11, x11, x18
- orr x10, x10, x13
- stp x10, x11, [rp,#-32]
-L(lo0): sub n, n, #4
- lsr x11, x7, tnc
- lsl x13, x7, cnt
- lsr x10, x6, tnc
- lsl x18, x6, cnt
- cbnz n, L(top)
-
-L(end): orr x11, x11, x9
+ PSHIFT x2, x6, cnt
+L(lo2): NSHIFT x10, x4, tnc
+ PSHIFT x13, x5, cnt
+ NSHIFT x12, x5, tnc
+ ldp x6, x7, [up,#-32]!
orr x10, x10, x13
- stp x10, x11, [rp,#-48]
- str x18, [rp,#-56]
+ orr x11, x12, x2
+ stp x10, x11, [rp,#-32]!
+ PSHIFT x2, x4, cnt
+L(lo0): sub x18, x18, #1
+L(lo3): NSHIFT x10, x6, tnc
+ PSHIFT x13, x7, cnt
+ NSHIFT x12, x7, tnc
+ cbnz x18, L(top)
+
+L(end): orr x10, x10, x13
+ orr x11, x12, x2
+ PSHIFT x2, x6, cnt
+ stp x10, x11, [rp,#-16]
+ str x2, [rp,#-24]
ret
EPILOGUE()
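
The rshift rewrite below mirrors this: PSHIFT/NSHIFT become lsr/lsl, the pointers walk upward instead of downward, and the return value is the bits shifted out of the bottom limb. The corresponding C sketch of mpn_rshift's contract (same caveats as above; ref_rshift is our name):

    /* Reference sketch of mpn_rshift's contract (illustration only). */
    uint64_t ref_rshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
    {
        unsigned tnc = 64 - cnt;
        uint64_t retval = up[0] << tnc;     /* bits shifted out the bottom */
        for (size_t i = 0; i + 1 < n; i++)  /* low-to-high, as the asm walks */
            rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
        rp[n - 1] = up[n - 1] >> cnt;
        return retval;
    }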
diff --git a/mpn/arm64/rshift.asm b/mpn/arm64/rshift.asm
index dac51ef09..97ddda741 100644
--- a/mpn/arm64/rshift.asm
+++ b/mpn/arm64/rshift.asm
@@ -1,6 +1,6 @@
dnl ARM64 mpn_rshift.
-dnl Copyright 2013, 2014 Free Software Foundation, Inc.
+dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
+C               cycles/limb    assumed optimal c/l
+C Cortex-A53    3.5-4.0        3.25
+C Cortex-A57    2.0            2.0
+C X-Gene        2.67           2.5
+
+C TODO
+C * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes. These
+C numbers should be 1 and 0, respectively. The str in wind-down should also
+C go.
+C * Using extr, and with 63 separate loops, we might reach 1.25 c/l on A57.
changecom(blah)
@@ -34,88 +41,84 @@ define(`rp', `x16')
define(`tnc',`x8')
+define(`PSHIFT', lsr)
+define(`NSHIFT', lsl)
+
ASM_START()
PROLOGUE(mpn_rshift)
mov rp, rp_arg
sub tnc, xzr, cnt
+ lsr x18, n, #2
tbz n, #0, L(bx0)
-L(bx1): ldr x4, [up,#0]
+L(bx1): ldr x5, [up]
tbnz n, #1, L(b11)
-L(b01): lsl x0, x4, tnc
- lsr x18, x4, cnt
- sub n, n, #1
- cbnz n, L(gt1)
- str x18, [rp,#0]
+L(b01): NSHIFT x0, x5, tnc
+ PSHIFT x2, x5, cnt
+ cbnz x18, L(gt1)
+ str x2, [rp]
ret
-L(gt1): ldp x5, x4, [up,#8]
+L(gt1): ldp x4, x5, [up,#8]
sub up, up, #8
sub rp, rp, #32
b L(lo2)
-L(b11): lsl x0, x4, tnc
- lsr x9, x4, cnt
- ldp x7, x6, [up,#8]
- add n, n, #1
- sub up, up, #24
- sub rp, rp, #48
- b L(lo0)
+L(b11): NSHIFT x0, x5, tnc
+ PSHIFT x2, x5, cnt
+ ldp x6, x7, [up,#8]!
+ sub rp, rp, #16
+ b L(lo3)
-L(bx0): ldp x5, x4, [up,#0]
+L(bx0): ldp x4, x5, [up]
tbz n, #1, L(b00)
-L(b10): lsl x0, x5, tnc
- lsr x13, x5, cnt
- lsl x10, x4, tnc
- lsr x18, x4, cnt
- sub n, n, #2
- cbnz n, L(gt2)
+L(b10): NSHIFT x0, x4, tnc
+ PSHIFT x13, x4, cnt
+ NSHIFT x10, x5, tnc
+ PSHIFT x2, x5, cnt
+ cbnz x18, L(gt2)
orr x10, x10, x13
- stp x10, x18, [rp,#0]
+ stp x10, x2, [rp]
ret
-L(gt2): ldp x5, x4, [up,#16]
+L(gt2): ldp x4, x5, [up,#16]
orr x10, x10, x13
- str x10, [rp,#0]
- sub rp, rp, #24
+ str x10, [rp],#-24
b L(lo2)
-L(b00): lsl x0, x5, tnc
- lsr x13, x5, cnt
- lsl x10, x4, tnc
- lsr x9, x4, cnt
- ldp x7, x6, [up,#16]
+L(b00): NSHIFT x0, x4, tnc
+ PSHIFT x13, x4, cnt
+ NSHIFT x10, x5, tnc
+ PSHIFT x2, x5, cnt
+ ldp x6, x7, [up,#16]!
orr x10, x10, x13
- str x10, [rp,#0]
- sub up, up, #16
- sub rp, rp, #40
+ str x10, [rp],#-8
b L(lo0)
ALIGN(16)
-L(top): ldp x5, x4, [up,#48]
- add rp, rp, #32 C integrate with stp?
- add up, up, #32 C integrate with ldp?
- orr x11, x11, x9
+L(top): ldp x4, x5, [up,#16]
orr x10, x10, x13
+ orr x11, x12, x2
stp x11, x10, [rp,#16]
-L(lo2): lsl x11, x5, tnc
- lsr x13, x5, cnt
- lsl x10, x4, tnc
- lsr x9, x4, cnt
- ldp x7, x6, [up,#32]
- orr x11, x11, x18
+ PSHIFT x2, x7, cnt
+L(lo2): NSHIFT x10, x5, tnc
+ NSHIFT x12, x4, tnc
+ PSHIFT x13, x4, cnt
+ ldp x6, x7, [up,#32]!
orr x10, x10, x13
- stp x11, x10, [rp,#32]
-L(lo0): sub n, n, #4
- lsl x11, x7, tnc
- lsr x13, x7, cnt
- lsl x10, x6, tnc
- lsr x18, x6, cnt
- cbnz n, L(top)
-
-L(end): orr x11, x11, x9
- orr x10, x10, x13
- stp x11, x10, [rp,#48]
- str x18, [rp,#64]
+ orr x11, x12, x2
+ stp x11, x10, [rp,#32]!
+ PSHIFT x2, x5, cnt
+L(lo0): sub x18, x18, #1
+L(lo3): NSHIFT x10, x7, tnc
+ NSHIFT x12, x6, tnc
+ PSHIFT x13, x6, cnt
+ cbnz x18, L(top)
+
+L(end): orr x10, x10, x13
+ orr x11, x12, x2
+ PSHIFT x2, x7, cnt
+ stp x11, x10, [rp,#16]
+ str x2, [rp,#32]
ret
EPILOGUE()
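
On the extr idea from both TODO comments: for a fixed shift count, each two-limb merge above is a funnel shift, which AArch64 encodes as a single extr instruction. Since extr's shift amount is an immediate, exploiting it means one specialized loop per shift count, hence the "63 separate loops". A hedged C illustration for one count (the helper name and the count 7 are ours):

    /* One fixed-count merge; compilers emit this as extr xd, hi, lo, #57. */
    #define CNT 7
    static inline uint64_t lshift_limb_fixed(uint64_t hi, uint64_t lo)
    {
        return (hi << CNT) | (lo >> (64 - CNT));
    }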