diff options
author:    Torbjorn Granlund <tg@gmplib.org>  2017-02-21 16:20:58 +0100
committer: Torbjorn Granlund <tg@gmplib.org>  2017-02-21 16:20:58 +0100
commit:    d8206c844ce3a7609928c347513f8058a902af69 (patch)
tree:      0856eb43e1d58a5572984b775459a255b240f546 /mpn/arm64
parent:    c45499aff87b10190280ab9122a6ab6f228dec78 (diff)
download:  gmp-d8206c844ce3a7609928c347513f8058a902af69.tar.gz
Rewrite ARM64 shifting.
Diffstat (limited to 'mpn/arm64')
-rw-r--r--  mpn/arm64/lshift.asm | 108
-rw-r--r--  mpn/arm64/rshift.asm | 121
2 files changed, 118 insertions, 111 deletions
diff --git a/mpn/arm64/lshift.asm b/mpn/arm64/lshift.asm index 72b74fb22..1bb5698b9 100644 --- a/mpn/arm64/lshift.asm +++ b/mpn/arm64/lshift.asm @@ -1,6 +1,6 @@ dnl ARM64 mpn_lshift. -dnl Copyright 2013, 2014 Free Software Foundation, Inc. +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C Cortex-A53 ? -C Cortex-A57 ? +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. changecom(blah) @@ -34,46 +41,45 @@ define(`rp', `x16') define(`tnc',`x8') +define(`PSHIFT', lsl) +define(`NSHIFT', lsr) + ASM_START() PROLOGUE(mpn_lshift) add rp, rp_arg, n, lsl #3 add up, up, n, lsl #3 sub tnc, xzr, cnt + lsr x18, n, #2 tbz n, #0, L(bx0) L(bx1): ldr x4, [up,#-8] tbnz n, #1, L(b11) -L(b01): lsr x0, x4, tnc - lsl x18, x4, cnt - sub n, n, #1 - cbnz n, L(gt1) - str x18, [rp,#-8] +L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + cbnz x18, L(gt1) + str x2, [rp,#-8] ret L(gt1): ldp x4, x5, [up,#-24] sub up, up, #8 add rp, rp, #16 b L(lo2) -L(b11): lsr x0, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-24] - add n, n, #1 - add up, up, #8 - add rp, rp, #32 - b L(lo0) +L(b11): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-24]! 
+ b L(lo3) L(bx0): ldp x4, x5, [up,#-16] tbz n, #1, L(b00) -L(b10): lsr x0, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x18, x4, cnt - sub n, n, #2 - cbnz n, L(gt2) +L(b10): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + cbnz x18, L(gt2) orr x10, x10, x13 - stp x18, x10, [rp,#-16] + stp x2, x10, [rp,#-16] ret L(gt2): ldp x4, x5, [up,#-32] orr x10, x10, x13 @@ -82,41 +88,39 @@ L(gt2): ldp x4, x5, [up,#-32] add rp, rp, #8 b L(lo2) -L(b00): lsr x0, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-32] +L(b00): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-32]! orr x10, x10, x13 - str x10, [rp,#-8] - add rp, rp, #24 + str x10, [rp,#-8]! b L(lo0) ALIGN(16) -L(top): ldp x4, x5, [up,#-48] - sub rp, rp, #32 C integrate with stp? - sub up, up, #32 C integrate with ldp? - orr x11, x11, x9 +L(top): ldp x4, x5, [up,#-16] orr x10, x10, x13 + orr x11, x12, x2 stp x10, x11, [rp,#-16] -L(lo2): lsr x11, x5, tnc - lsl x13, x5, cnt - lsr x10, x4, tnc - lsl x9, x4, cnt - ldp x6, x7, [up,#-32] - orr x11, x11, x18 - orr x10, x10, x13 - stp x10, x11, [rp,#-32] -L(lo0): sub n, n, #4 - lsr x11, x7, tnc - lsl x13, x7, cnt - lsr x10, x6, tnc - lsl x18, x6, cnt - cbnz n, L(top) - -L(end): orr x11, x11, x9 + PSHIFT x2, x6, cnt +L(lo2): NSHIFT x10, x4, tnc + PSHIFT x13, x5, cnt + NSHIFT x12, x5, tnc + ldp x6, x7, [up,#-32]! orr x10, x10, x13 - stp x10, x11, [rp,#-48] - str x18, [rp,#-56] + orr x11, x12, x2 + stp x10, x11, [rp,#-32]! 
+ PSHIFT x2, x4, cnt +L(lo0): sub x18, x18, #1 +L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc + cbnz x18, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x6, cnt + stp x10, x11, [rp,#-16] + str x2, [rp,#-24] ret EPILOGUE() diff --git a/mpn/arm64/rshift.asm b/mpn/arm64/rshift.asm index dac51ef09..97ddda741 100644 --- a/mpn/arm64/rshift.asm +++ b/mpn/arm64/rshift.asm @@ -1,6 +1,6 @@ dnl ARM64 mpn_rshift. -dnl Copyright 2013, 2014 Free Software Foundation, Inc. +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -19,9 +19,16 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') -C cycles/limb -C Cortex-A53 ? -C Cortex-A57 ? +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. changecom(blah) @@ -34,88 +41,84 @@ define(`rp', `x16') define(`tnc',`x8') +define(`PSHIFT', lsr) +define(`NSHIFT', lsl) + ASM_START() PROLOGUE(mpn_rshift) mov rp, rp_arg sub tnc, xzr, cnt + lsr x18, n, #2 tbz n, #0, L(bx0) -L(bx1): ldr x4, [up,#0] +L(bx1): ldr x5, [up] tbnz n, #1, L(b11) -L(b01): lsl x0, x4, tnc - lsr x18, x4, cnt - sub n, n, #1 - cbnz n, L(gt1) - str x18, [rp,#0] +L(b01): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + cbnz x18, L(gt1) + str x2, [rp] ret -L(gt1): ldp x5, x4, [up,#8] +L(gt1): ldp x4, x5, [up,#8] sub up, up, #8 sub rp, rp, #32 b L(lo2) -L(b11): lsl x0, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#8] - add n, n, #1 - sub up, up, #24 - sub rp, rp, #48 - b L(lo0) +L(b11): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#8]! 
+ sub rp, rp, #16 + b L(lo3) -L(bx0): ldp x5, x4, [up,#0] +L(bx0): ldp x4, x5, [up] tbz n, #1, L(b00) -L(b10): lsl x0, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x18, x4, cnt - sub n, n, #2 - cbnz n, L(gt2) +L(b10): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + cbnz x18, L(gt2) orr x10, x10, x13 - stp x10, x18, [rp,#0] + stp x10, x2, [rp] ret -L(gt2): ldp x5, x4, [up,#16] +L(gt2): ldp x4, x5, [up,#16] orr x10, x10, x13 - str x10, [rp,#0] - sub rp, rp, #24 + str x10, [rp],#-24 b L(lo2) -L(b00): lsl x0, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#16] +L(b00): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#16]! orr x10, x10, x13 - str x10, [rp,#0] - sub up, up, #16 - sub rp, rp, #40 + str x10, [rp],#-8 b L(lo0) ALIGN(16) -L(top): ldp x5, x4, [up,#48] - add rp, rp, #32 C integrate with stp? - add up, up, #32 C integrate with ldp? - orr x11, x11, x9 +L(top): ldp x4, x5, [up,#16] orr x10, x10, x13 + orr x11, x12, x2 stp x11, x10, [rp,#16] -L(lo2): lsl x11, x5, tnc - lsr x13, x5, cnt - lsl x10, x4, tnc - lsr x9, x4, cnt - ldp x7, x6, [up,#32] - orr x11, x11, x18 + PSHIFT x2, x7, cnt +L(lo2): NSHIFT x10, x5, tnc + NSHIFT x12, x4, tnc + PSHIFT x13, x4, cnt + ldp x6, x7, [up,#32]! orr x10, x10, x13 - stp x11, x10, [rp,#32] -L(lo0): sub n, n, #4 - lsl x11, x7, tnc - lsr x13, x7, cnt - lsl x10, x6, tnc - lsr x18, x6, cnt - cbnz n, L(top) - -L(end): orr x11, x11, x9 - orr x10, x10, x13 - stp x11, x10, [rp,#48] - str x18, [rp,#64] + orr x11, x12, x2 + stp x11, x10, [rp,#32]! + PSHIFT x2, x5, cnt +L(lo0): sub x18, x18, #1 +L(lo3): NSHIFT x10, x7, tnc + NSHIFT x12, x6, tnc + PSHIFT x13, x6, cnt + cbnz x18, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x7, cnt + stp x11, x10, [rp,#16] + str x2, [rp,#32] ret EPILOGUE() |