diff options
author | Torbjorn Granlund <tg@gmplib.org> | 2017-02-16 18:23:41 +0100 |
---|---|---|
committer | Torbjorn Granlund <tg@gmplib.org> | 2017-02-16 18:23:41 +0100 |
commit | 21437dc250d61d7346c8edda6dd8cf29ad54e833 (patch) | |
tree | 7c9bf0906551d1dcba19ce9e9e4a65bc35ed80a5 /mpn/arm64 | |
parent | 51cbfba7c84f5eaab5f19de10ed25d5de482e5e1 (diff) | |
download | gmp-21437dc250d61d7346c8edda6dd8cf29ad54e833.tar.gz |
Rewrite to use 4x unrolling.
Diffstat (limited to 'mpn/arm64')
-rw-r--r-- | mpn/arm64/aors_n.asm | 91 |
1 file changed, 59 insertions, 32 deletions
diff --git a/mpn/arm64/aors_n.asm b/mpn/arm64/aors_n.asm
index 256c186d0..b440519f0 100644
--- a/mpn/arm64/aors_n.asm
+++ b/mpn/arm64/aors_n.asm
@@ -33,9 +33,9 @@ dnl see https://www.gnu.org/licenses/.
 include(`../config.m4')
 
 C	     cycles/limb
-C Cortex-A53	3-3.5
-C Cortex-A57	 2
-C X-Gene	 2.5
+C Cortex-A53	2.75-3.25
+C Cortex-A57	 1.5
+C X-Gene	 2.0
 
 changecom(blah)
 
@@ -49,14 +49,14 @@ ifdef(`OPERATION_add_n', `
   define(`CLRCY',	`cmn	xzr, xzr')
   define(`SETCY',	`cmp	$1, #1')
   define(`RETVAL',	`cset	x0, cs')
-  define(`func',	mpn_add_n)
+  define(`func_n',	mpn_add_n)
   define(`func_nc',	mpn_add_nc)')
 ifdef(`OPERATION_sub_n', `
   define(`ADDSUBC',	sbcs)
   define(`CLRCY',	`cmp	xzr, xzr')
   define(`SETCY',	`cmp	xzr, $1')
   define(`RETVAL',	`cset	x0, cc')
-  define(`func',	mpn_sub_n)
+  define(`func_n',	mpn_sub_n)
   define(`func_nc',	mpn_sub_nc)')
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
@@ -66,33 +66,60 @@ PROLOGUE(func_nc)
 	SETCY(	x4)
 	b	L(ent)
 EPILOGUE()
-PROLOGUE(func)
+PROLOGUE(func_n)
 	CLRCY
-L(ent):	tbz	n, #0, L(b0)
-
-	ldr	x4, [up],#8
-	ldr	x6, [vp],#8
-	sub	n, n, #1
-	ADDSUBC	x8, x4, x6
-	str	x8, [rp],#8
-	cbz	n, L(rt)
-
-L(b0):	ldp	x4, x5, [up],#16
-	ldp	x6, x7, [vp],#16
-	sub	n, n, #2
-	ADDSUBC	x8, x4, x6
-	ADDSUBC	x9, x5, x7
-	cbz	n, L(end)
-
-L(top):	ldp	x4, x5, [up],#16
-	ldp	x6, x7, [vp],#16
-	sub	n, n, #2
-	stp	x8, x9, [rp],#16
-	ADDSUBC	x8, x4, x6
-	ADDSUBC	x9, x5, x7
-	cbnz	n, L(top)
-
-L(end):	stp	x8, x9, [rp]
-L(rt):	RETVAL
+L(ent):	lsr	x17, n, #2		C x17 = n / 4 = passes through L(mid); x17 is used because x18 is platform-reserved on some AArch64 ABIs
+	tbz	n, #0, L(bx0)		C bit 0 clear: n is even
+
+L(bx1):	ldr	x7, [up]		C n is odd: combine a single limb first
+	ldr	x11, [vp]
+	ADDSUBC	x15, x7, x11
+	str	x15, [rp],#8
+	tbnz	n, #1, L(b11)		C bit 1 set: n mod 4 = 3
+
+L(b01):	cbz	x17, L(ret)		C n mod 4 = 1; n = 1 means nothing is left
+	ldp	x4, x5, [up,#8]
+	ldp	x8, x9, [vp,#8]
+	sub	up, up, #8		C bias so that the pre-indexed #32 at L(mid) hits the next unread pair
+	sub	vp, vp, #8
+	b	L(mid)
+
+L(b11):	ldp	x6, x7, [up,#8]
+	ldp	x10, x11, [vp,#8]
+	add	up, up, #8		C bias so that offset #16 at L(top) hits the next unread pair
+	add	vp, vp, #8
+	cbz	x17, L(end)		C n = 3: only the pair just loaded remains
+	b	L(top)
+
+L(bx0):	tbnz	n, #1, L(b10)		C bit 1 set: n mod 4 = 2
+
+L(b00):	ldp	x4, x5, [up]		C n mod 4 = 0
+	ldp	x8, x9, [vp]
+	sub	up, up, #16		C bias so that the pre-indexed #32 at L(mid) hits the next unread pair
+	sub	vp, vp, #16
+	b	L(mid)
+
+L(b10):	ldp	x6, x7, [up]
+	ldp	x10, x11, [vp]
+	cbz	x17, L(end)		C n = 2: only the pair just loaded remains
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]	C load the pair used in the second half of this pass
+	ldp	x8, x9, [vp,#16]
+	ADDSUBC	x14, x6, x10
+	ADDSUBC	x15, x7, x11
+	stp	x14, x15, [rp],#16
+L(mid):	ldp	x6, x7, [up,#32]!	C load the pair used by the next L(top) or by L(end)
+	ldp	x10, x11, [vp,#32]!
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	stp	x12, x13, [rp],#16
+	sub	x17, x17, #1		C plain sub leaves the carry flag intact for the next ADDSUBC
+	cbnz	x17, L(top)
+
+L(end):	ADDSUBC	x14, x6, x10		C final pair, loaded but not yet combined
+	ADDSUBC	x15, x7, x11
+	stp	x14, x15, [rp]
+L(ret):	RETVAL				C x0 = carry or borrow out, from the final flag state
 	ret
 EPILOGUE()