diff options
author | Torbjorn Granlund <tg@gmplib.org> | 2021-10-16 18:03:13 +0200 |
---|---|---|
committer | Torbjorn Granlund <tg@gmplib.org> | 2021-10-16 18:03:13 +0200 |
commit | 20195d7a06bc88229142ca28becd8403be776750 (patch) | |
tree | ac718acc825c001efbcf2e782d87d767c1b1d00b /mpn | |
parent | 1f339389e8722e29bb893e4a03a41ce1382c76bc (diff) | |
download | gmp-20195d7a06bc88229142ca28becd8403be776750.tar.gz |
Provide addaddmul_1msb0 for Apple M1.
Diffstat (limited to 'mpn')
-rw-r--r-- | mpn/arm64/applem1/addaddmul_1msb0.asm | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/mpn/arm64/applem1/addaddmul_1msb0.asm b/mpn/arm64/applem1/addaddmul_1msb0.asm new file mode 100644 index 000000000..03cbf9775 --- /dev/null +++ b/mpn/arm64/applem1/addaddmul_1msb0.asm @@ -0,0 +1,92 @@ +dnl ARM64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 +C Cortex-A55 +C Cortex-A57 +C Cortex-A72 +C Cortex-A73 +C X-Gene +C Apple M1 2.0 + +changecom(blah) + +define(`rp', x0) +define(`ap', x1) +define(`bp', x2) +define(`n', x3) +define(`u0', x4) +define(`v0', x5) + +C TODO +C * Use fewer distinct registers, should be trivial. + +PROLOGUE(mpn_addaddmul_1msb0) + lsr x7, n, #1 + adds x6, xzr, xzr + tbz n, #0, L(top) + + ldr x11, [ap], #8 C 0 + ldr x15, [bp], #8 C 0 + mul x10, x11, u0 C 0 + umulh x11, x11, u0 C 1 + mul x14, x15, v0 C 0 + umulh x15, x15, v0 C 1 + adds x10, x10, x14 C 0 + adcs x6, x11, x15 C 1 + str x10, [rp], #8 C 0 + cbz x7, L(end) + +L(top): ldp x11, x13, [ap], #16 C 0 1 + ldp x15, x17, [bp], #16 C 0 1 + mul x10, x11, u0 C 0 + umulh x11, x11, u0 C 1 + mul x14, x15, v0 C 0 + umulh x15, x15, v0 C 1 + adcs x10, x10, x14 C 0 + adc x11, x11, x15 C 1 + adds x10, x10, x6 C 0 + mul x12, x13, u0 C 1 + umulh x13, x13, u0 C 2 + mul x14, x17, v0 C 1 + umulh x17, x17, v0 C 2 + adcs x12, x12, x14 C 1 + adc x6, x13, x17 C 2 + adds x11, x12, x11 C 1 + stp x10, x11, [rp], #16 C 0 1 + sub x7, x7, #1 + cbnz x7, L(top) + +L(end): adc x0, x6, xzr + ret +EPILOGUE() |