diff options
author | Torbjorn Granlund <tg@gmplib.org> | 2020-11-29 23:45:15 +0100 |
---|---|---|
committer | Torbjorn Granlund <tg@gmplib.org> | 2020-11-29 23:45:15 +0100 |
commit | 8eec58d33b26715bbe9683db9aaee498849019b2 (patch) | |
tree | a55c876c8f917d894b349651fce91a5470302eb2 /mpn | |
parent | 4b29bdbccdd786729c71d54593c403b38f5db0b6 (diff) | |
download | gmp-8eec58d33b26715bbe9683db9aaee498849019b2.tar.gz |
New file.
Diffstat (limited to 'mpn')
-rw-r--r-- | mpn/arm64/divrem_1.asm | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/mpn/arm64/divrem_1.asm b/mpn/arm64/divrem_1.asm new file mode 100644 index 000000000..29e8b158c --- /dev/null +++ b/mpn/arm64/divrem_1.asm @@ -0,0 +1,231 @@ +dnl ARM64 mpn_divrem_1 and mpn_preinv_divrem_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl TODO +dnl * Handle the most significant quotient limb for the unnormalised case +dnl specially, just like in the C code. (It is very often 0.) 
+
+dnl Incoming argument registers, in the order of the C prototypes below.
+define(`qp_arg', x0)
+define(`fn_arg', x1)
+define(`np_arg', x2)
+define(`n_arg', x3)
+define(`d_arg', x4)
+define(`dinv_arg', x5)
+define(`cnt_arg', x6)
+
+dnl Working registers.  qp, np, n, d, cnt and fn live in x19-x24, which the
+dnl prologues save and the exit paths restore.  dinv shares x0 with qp_arg
+dnl (and with the value left in x0 by the invert_limb call), so qp_arg must
+dnl be consumed before dinv is written.  tnc is x8 and is only set up after
+dnl the invert_limb call.
+define(`qp', x19)
+define(`np', x20)
+define(`n', x21)
+define(`d', x22)
+define(`fn', x24)
+define(`dinv', x0)
+define(`cnt', x23)
+define(`tnc', x8)
+
+dnl mp_limb_t
+dnl mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+dnl mp_srcptr np, mp_size_t n,
+dnl mp_limb_t d_unnorm)
+
+dnl mp_limb_t
+dnl mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+dnl mp_srcptr np, mp_size_t n,
+dnl mp_limb_t d_unnorm, mp_limb_t dinv, int cnt)
+
+dnl Both entry points write n + fn quotient limbs, most significant first,
+dnl walking qp and np downwards, and return the final remainder in x0.
+dnl The quotient/remainder step used in every loop below is:
+dnl   x2  = r + 1 + hi(r * dinv) + carry(limb + lo(r * dinv))  candidate q
+dnl   x11 = limb - d * x2                                      candidate r
+dnl   if the low sum word (x10) is below x11: r += d, q -= 1
+dnl   if r >= d: q += 1, r -= d
+
+ASM_START()
+
+PROLOGUE(mpn_preinv_divrem_1)
+        cbz     n_arg, L(fz)                    C n = 0: only zero-fill the fn fraction limbs
+        stp     x29, x30, [sp, #-80]!           C 80-byte frame; offsets 64..79 are not used
+        mov     x29, sp
+        stp     x19, x20, [sp, #16]
+        stp     x21, x22, [sp, #32]
+        stp     x23, x24, [sp, #48]
+
+        add     n, n_arg, #-1                   C n = index of the top numerator limb
+        add     x7, n, fn_arg
+        add     np, np_arg, n, lsl #3           C np -> most significant numerator limb
+        add     qp, qp_arg, x7, lsl #3          C qp -> quotient limb at index n - 1 + fn
+        mov     fn, fn_arg
+        mov     d, d_arg
+        mov     dinv, dinv_arg                  C overwrites x0; qp_arg was consumed above
+        tbnz    d_arg, #63, L(nentry)           C top bit set: join the normalised path, cnt unused
+        mov     cnt, cnt_arg
+        b       L(uentry)                       C join the unnormalised path after its invert_limb call
+EPILOGUE()
+
+PROLOGUE(mpn_divrem_1)
+        cbz     n_arg, L(fz)                    C n = 0: only zero-fill the fn fraction limbs
+        stp     x29, x30, [sp, #-80]!           C same frame layout as mpn_preinv_divrem_1
+        mov     x29, sp
+        stp     x19, x20, [sp, #16]
+        stp     x21, x22, [sp, #32]
+        stp     x23, x24, [sp, #48]
+
+        add     n, n_arg, #-1                   C n = index of the top numerator limb
+        add     x7, n, fn_arg
+        add     np, np_arg, n, lsl #3           C np -> most significant numerator limb
+        add     qp, qp_arg, x7, lsl #3          C qp -> quotient limb at index n - 1 + fn
+        mov     fn, fn_arg
+        mov     d, d_arg
+        tbnz    d_arg, #63, L(normalised)
+
+C Unnormalised divisor: shift d left by its leading zero count and shift the
+C numerator by the same amount on the fly; the remainder is shifted back
+C right by cnt before returning.
+L(unnorm):
+        clz     cnt, d
+        lsl     x0, d, cnt                      C argument: the shifted divisor
+        bl      GSYM_PREFIX`'MPN(invert_limb)
+C NOTE(review): invert_limb is defined elsewhere; its result is used from x0
+C as dinv, presumably the reciprocal of the shifted divisor -- confirm.
+L(uentry):
+        lsl     d, d, cnt
+        ldr     x7, [np], #-8                   C top numerator limb, np steps down
+        sub     tnc, xzr, cnt                   C tnc = -cnt; register shifts use the amount mod 64
+        lsr     x11, x7, tnc                    C r
+        lsl     x1, x7, cnt                     C remaining low bits of the top limb, shifted up
+        cbz     n, L(uend)                      C single numerator limb: go straight to the last step
+
+C Runs n_arg - 1 times.  x1 holds the pending shifted bits of the previous
+C limb, x7 the limb just loaded, x11 the running remainder.
+L(utop):ldr     x7, [np], #-8
+        add     x2, x11, #1
+        mul     x10, x11, dinv                  C low word of r * dinv
+        umulh   x17, x11, dinv                  C high word of r * dinv
+        lsr     x9, x7, tnc
+        orr     x1, x1, x9                      C complete the shifted numerator limb
+        adds    x10, x1, x10
+        adc     x2, x2, x17                     C candidate quotient limb
+        msub    x11, d, x2, x1                  C candidate remainder = x1 - d * x2
+        lsl     x1, x7, cnt                     C pending bits for the next iteration
+        cmp     x10, x11
+        add     x14, x11, d
+        csel    x11, x14, x11, cc               C x10 below x11: add d back
+        sbc     x2, x2, xzr                     C ... and decrement the quotient limb
+        cmp     x11, d
+        bcs     L(ufx)                          C remainder still >= d: fix up out of line
+L(uok): str     x2, [qp], #-8
+        sub     n, n, #1
+        cbnz    n, L(utop)
+
+C Last integer limb: no further limb to merge into x1, and the second
+C adjustment is done without a branch.
+L(uend):add     x2, x11, #1
+        mul     x10, x11, dinv
+        umulh   x17, x11, dinv
+        adds    x10, x1, x10
+        adc     x2, x2, x17
+        msub    x11, d, x2, x1
+        cmp     x10, x11
+        add     x14, x11, d
+        csel    x11, x14, x11, cc
+        sbc     x2, x2, xzr
+        subs    x14, x11, d
+        adc     x2, x2, xzr                     C carry set means r >= d: bump the quotient limb
+        csel    x11, x14, x11, cs               C ... and keep r - d
+        str     x2, [qp], #-8
+
+        cbnz    fn, L(ftop)                     C fraction limbs requested: continue in the shared loop
+        lsr     x0, x11, cnt                    C shift the remainder back down
+        ldp     x19, x20, [sp, #16]
+        ldp     x21, x22, [sp, #32]
+        ldp     x23, x24, [sp, #48]
+        ldp     x29, x30, [sp], #80
+        ret
+
+C Out-of-line second adjustment for the unnormalised loop.
+L(ufx): add     x2, x2, #1
+        sub     x11, x11, d
+        b       L(uok)
+
+
+C Divisor already has its top bit set: no shifting of d or of the numerator.
+L(normalised):
+        mov     x0, d
+        bl      GSYM_PREFIX`'MPN(invert_limb)
+L(nentry):
+        ldr     x7, [np], #-8                   C top numerator limb
+        subs    x14, x7, d
+        adc     x2, xzr, xzr                    C hi q limb
+        csel    x11, x14, x7, cs                C r = top limb, reduced by d when it was >= d
+        b       L(nok)                          C store the high limb, then enter the loop test
+
+L(ntop):ldr     x1, [np], #-8
+        add     x2, x11, #1
+        mul     x10, x11, dinv
+        umulh   x17, x11, dinv
+        adds    x10, x1, x10
+        adc     x2, x2, x17
+        msub    x11, d, x2, x1
+        cmp     x10, x11
+        add     x14, x11, d
+        csel    x11, x14, x11, cc               C remainder
+        sbc     x2, x2, xzr
+        cmp     x11, d
+        bcs     L(nfx)                          C remainder still >= d: fix up out of line
+L(nok): str     x2, [qp], #-8
+        sub     n, n, #1
+        tbz     n, #63, L(ntop)                 C loop until n goes negative
+
+L(nend):cbnz    fn, L(frac)
+        mov     x0, x11                         C return the remainder
+        ldp     x19, x20, [sp, #16]
+        ldp     x21, x22, [sp, #32]
+        ldp     x23, x24, [sp, #48]
+        ldp     x29, x30, [sp], #80
+        ret
+
+C Out-of-line second adjustment for the normalised loop.
+L(nfx): add     x2, x2, #1
+        sub     x11, x11, d
+        b       L(nok)
+
+C Fraction limbs: the same step with a zero numerator limb, so there is no
+C carry into the quotient and no second adjustment.  Shared by both paths;
+C the normalised path enters via L(frac) with cnt = 0 so the final shift
+C leaves the remainder unchanged.
+L(frac):mov     cnt, #0
+L(ftop):add     x2, x11, #1
+        mul     x10, x11, dinv
+        umulh   x17, x11, dinv
+        add     x2, x2, x17
+        msub    x11, d, x2, xzr                 C candidate remainder = 0 - d * x2
+        cmp     x10, x11
+        add     x14, x11, d
+        csel    x11, x14, x11, cc               C remainder
+        sbc     x2, x2, xzr
+        str     x2, [qp], #-8
+        sub     fn, fn, #1
+        cbnz    fn, L(ftop)
+
+        lsr     x0, x11, cnt
+        ldp     x19, x20, [sp, #16]
+        ldp     x21, x22, [sp, #32]
+        ldp     x23, x24, [sp, #48]
+        ldp     x29, x30, [sp], #80
+        ret
+
+C Block zero. We need this for the degenerated case of n = 0, fn != 0.
+C Reached before any frame is set up, so it works on the argument registers
+C directly, stores fn zero limbs upwards from qp and returns 0.
+L(fz):  cbz     fn_arg, L(zend)
+L(ztop):str     xzr, [qp_arg], #8
+        sub     fn_arg, fn_arg, #1
+        cbnz    fn_arg, L(ztop)
+L(zend):mov     x0, #0
+        ret
+EPILOGUE() |