diff options
author | Torbjorn Granlund <tege@gmplib.org> | 2011-10-27 15:41:57 +0200 |
---|---|---|
committer | Torbjorn Granlund <tege@gmplib.org> | 2011-10-27 15:41:57 +0200 |
commit | 1aa3ae0fc22ab8b5e70e9afdad72d21e0c2295b2 (patch) | |
tree | 880c0985b444e7e28097582111a7b5a33f5de7bc | |
parent | 157b8420605b1d300cfe8663ff5bd707e2378aa7 (diff) | |
download | gmp-1aa3ae0fc22ab8b5e70e9afdad72d21e0c2295b2.tar.gz |
Add s390_32 sqr_basecase; remove the separate sqr_diag_addlsh1.asm, now integrated into it.
-rw-r--r-- | mpn/s390_32/esame/sqr_basecase.asm | 202 | ||||
-rw-r--r-- | mpn/s390_32/esame/sqr_diag_addlsh1.asm | 77 |
2 files changed, 202 insertions, 77 deletions
diff --git a/mpn/s390_32/esame/sqr_basecase.asm b/mpn/s390_32/esame/sqr_basecase.asm new file mode 100644 index 000000000..6e813941e --- /dev/null +++ b/mpn/s390_32/esame/sqr_basecase.asm @@ -0,0 +1,202 @@ +dnl S/390-32 mpn_sqr_basecase. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 23 +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * Clean up. +C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail. +C This will ask for basecase handling of n = 3. +C * Update counters and pointers more straightforwardly, possibly lowering +C register usage. +C * Should we use this allocation-free style for more sqr_basecase asm +C implementations? The only disadvantage is that it requires R != U. +C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped +C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even +C more. 
+
+C INPUT PARAMETERS (the routine stores the square of the n limbs at up to rp)
+define(`rp', `%r2')                    C destination limb pointer
+define(`up', `%r3')                    C source limb pointer
+define(`n', `%r4')                     C number of source limbs, 4 bytes each
+
+define(`zero', `%r8')                  C constant 0, used to fold carries with alcr
+define(`rp_saved', `%r9')              C copy of rp for the diagonal pass
+define(`up_saved', `%r13')             C copy of up for the diagonal pass
+define(`n_saved', `%r14')              C copy of n; r14 is also the return address
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+        ahi     n, -2                   C n -= 2; the condition code selects the case
+        jhe     L(ge2)                  C original n >= 2
+
+C n = 1: store the two-limb product u0 * u0 and return
+        l       %r5, 0(up)
+        mlr     %r4, %r5                C r4:r5 = u0 * u0, high:low
+        st      %r5, 0(rp)              C low limb
+        st      %r4, 4(rp)              C high limb
+        br      %r14
+
+L(ge2): jne     L(gen)                  C n - 2 != 0, so original n >= 3
+
+C n = 2: u0^2, plus twice u0*u1 one limb up, plus u1^2 two limbs up
+        stm     %r6, %r8, 24(%r15)      C save r6-r8 at 24(r15); reloaded before returning
+        lhi     zero, 0
+
+        l       %r5, 0(up)
+        mlr     %r4, %r5                C u0 * u0
+        l       %r1, 4(up)
+        mlr     %r0, %r1                C u1 * u1
+        st      %r5, 0(rp)              C lowest limb is already final
+
+        l       %r7, 0(up)
+        ml      %r6, 4(up)              C u0 * u1
+        alr     %r7, %r7                C double the cross product r6:r7 ...
+        alcr    %r6, %r6
+        alcr    %r0, zero               C ... its carry out goes into the top limb
+
+        alr     %r4, %r7                C add the doubled cross product to the middle limbs
+        alcr    %r1, %r6
+        alcr    %r0, zero
+        st      %r4, 4(rp)
+        st      %r1, 8(rp)
+        st      %r0, 12(rp)
+
+        lm      %r6, %r8, 24(%r15)      C restore r6-r8
+        br      %r14
+
+L(gen):                                 C original n >= 3; n holds the count minus 2
+C mul_1 =======================================================================
+
+        stm     %r6, %r14, 24(%r15)     C save r6-r14; r14 is the return address
+        lhi     zero, 0
+        lr      up_saved, up
+        lr      rp_saved, rp
+        lr      n_saved, n              C this is the count minus 2
+
+        l       %r6, 0(up)              C r6 = u0, the multiplier for this pass
+        l       %r11, 4(up)
+        lhi     %r12, 8                 C init index register
+        mlr     %r10, %r6               C r10:r11 = u1 * u0
+        lr      %r5, n                  C inner loop count
+        st      %r11, 4(rp)             C rp[1]; rp[0] is written by the diagonal pass
+        cr      %r15, %r15              C clear carry flag
+
+L(tm):  l       %r1, 0(%r12,up)
+        mlr     %r0, %r6                C r0:r1 = next source limb * u0
+        alcr    %r1, %r10               C add the previous high limb and the carry
+        lr      %r10, %r0               C copy high part to carry limb
+        st      %r1, 0(%r12,rp)
+        la      %r12, 4(%r12)           C next limb; la leaves the carry untouched
+        brct    %r5, L(tm)
+
+        alcr    %r0, zero               C fold the last carry into the top limb
+        st      %r0, 0(%r12,rp)
+
+C addmul_1 loop ===============================================================
+
+        ahi     n, -1                   C n is now the count minus 3
+        je      L(outer_end)            C zero: no addmul_1 row needed, only the tail product
+L(outer_loop):
+
+        la      rp, 8(rp)               C rp += 2
+        la      up, 4(up)               C up += 1
+        l       %r6, 0(up)              C multiplier for this row
+        l       %r11, 4(up)
+        lhi     %r12, 8                 C init index register
+        mlr     %r10, %r6               C r10:r11 = first product of the row
+        lr      %r5, n                  C inner loop count
+        al      %r11, 4(rp)             C accumulate into the existing result limb
+        st      %r11, 4(rp)
+
+L(tam): l       %r1, 0(%r12,up)
+        l       %r7, 0(%r12,rp)
+        mlr     %r0, %r6
+        alcr    %r1, %r7                C product low limb, result limb and incoming carry
+        alcr    %r0, zero
+        alr     %r1, %r10               C add the high limb of the previous product
+        lr      %r10, %r0
+        st      %r1, 0(%r12,rp)
+        la      %r12, 4(%r12)
+        brct    %r5, L(tam)
+
+        alcr    %r0, zero
+        st      %r0, 0(%r12,rp)         C new top limb of the partial result
+
+        brct    n, L(outer_loop)        C each row is one limb shorter than the previous one
+L(outer_end):
+
+        l       %r6, 4(up)
+        l       %r1, 8(up)
+        lr      %r7, %r0                C Same as: l %r7, 12(,rp)
+        mlr     %r0, %r6                C last cross product, of the two highest source limbs
+        alr     %r1, %r7
+        alcr    %r0, zero
+        st      %r1, 12(rp)
+        st      %r0, 16(rp)
+
+C sqr_diag_addlsh1 ============================================================
+
+        lr      up, up_saved            C rewind to the start of source and result
+        lr      rp, rp_saved
+        lr      n, n_saved
+
+        lhi     %r9, -1                 C set non-carry state; r9 and r13 are free after the reloads
+        l       %r1, 0(up)
+        mlr     %r0, %r1                C r0:r1 = u0 * u0
+        ahi     n, 1                    C loop count is the limb count minus 1
+        l       %r7, 4(rp)
+        lhi     %r6, 0                  C rp[0] holds no cross product, so start from 0
+        alr     %r7, %r7                C double rp[1]
+        j       L(mid)
+
+L(top): lm      %r6, %r7, 0(rp)         C next two cross-product limbs
+        alcr    %r6, %r6                C double them
+        alcr    %r7, %r7
+L(mid): slbr    %r13, %r13              C save carry
+        ahi     %r9, 1                  C restore old carry
+        alcr    %r6, %r1                C add the square of the current source limb
+        alcr    %r7, %r0
+        stm     %r6, %r7, 0(rp)
+        la      rp, 8(rp)
+        l       %r1, 4(up)              C next source limb
+        la      up, 4(up)
+        lr      %r9, %r13               C copy carry save register
+        mlr     %r0, %r1                C r0:r1 = square of that limb
+        brct    n, L(top)
+
+        l       %r6, 0(rp)              C last limb written by the cross-product passes
+        lhi     %r7, 0
+        alcr    %r6, %r6
+        alcr    %r7, %r7
+        ahi     %r9, 1                  C restore old carry
+        alcr    %r6, %r1
+        alcr    %r7, %r0
+        stm     %r6, %r7, 0(rp)         C the two most significant result limbs
+
+        lm      %r6, %r14, 24(%r15)     C restore r6-r14, including the return address
+        br      %r14
+EPILOGUE()
diff --git a/mpn/s390_32/esame/sqr_diag_addlsh1.asm b/mpn/s390_32/esame/sqr_diag_addlsh1.asm
deleted file mode 100644
index 2d31b1b76..000000000
--- a/mpn/s390_32/esame/sqr_diag_addlsh1.asm
+++ /dev/null
@@ -1,77 +0,0 @@
-dnl S/390-32 mpn_sqr_diag_addlsh1
-
-dnl Copyright 2011 Free Software Foundation, Inc.
-
-dnl This file is part of the GNU MP Library.
-
-dnl The GNU MP Library is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU Lesser General Public License as published
-dnl by the Free Software Foundation; either version 3 of the License, or (at
-dnl your option) any later version.
-
-dnl The GNU MP Library is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-dnl License for more details.
-
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C cycles/limb
-C z900 13
-C z990 8
-C z9 ?
-C z10 ?
-C z196 ?
-
-C INPUT PARAMETERS (rp gets the doubled tp limbs plus the squares of the up limbs)
-define(`rp', `%r2')                    C destination limb pointer
-define(`tp', `%r3')                    C limbs that are doubled
-define(`up', `%r4')                    C limbs that are squared
-define(`n', `%r5')                     C the loop count is derived from this
-
-ASM_START()
-PROLOGUE(mpn_sqr_diag_addlsh1)
-        stm     %r6, %r9, 24(%r15)      C save r6-r9 at 24(r15); reloaded before returning
-
-        lhi     %r9, -1                 C set non-carry state
-
-        l       %r1, 0(up)
-        mlr     %r0, %r1                C r0:r1 = u0 * u0
-        ahi     n, -1                   C n - 1 loop passes, then the L(end) tail
-        l       %r7, 0(tp)
-        alr     %r7, %r7                C double tp[0]
-        lhi     %r6, 0                  C nothing is doubled into the lowest result limb
-        j       L(mid)
-
-L(top): lm      %r6, %r7, 4(tp)         C next two tp limbs
-        la      tp, 8(tp)
-        alcr    %r6, %r6                C double them
-        alcr    %r7, %r7
-L(mid): slbr    %r8, %r8                C save carry
-        ahi     %r9, 1                  C restore old carry
-        alcr    %r6, %r1                C add the square of the current up limb
-        alcr    %r7, %r0
-        stm     %r6, %r7, 0(rp)
-        la      rp, 8(rp)
-        l       %r1, 4(up)              C next up limb
-        la      up, 4(up)
-        lr      %r9, %r8                C copy carry save register
-        mlr     %r0, %r1                C r0:r1 = square of that limb
-        brct    n, L(top)
-
-L(end): l       %r6, 4(tp)              C last tp limb
-        alcr    %r6, %r6
-        slbr    %r8, %r8                C save carry
-        ahi     %r9, 1                  C restore old carry
-        lhi     %r7, 1                  C bias by 1 so that adding r8 below yields the saved carry
-        alcr    %r6, %r1
-        alcr    %r7, %r0
-        alr     %r7, %r8                C r8 is 0 or -1 after slbr
-        stm     %r6, %r7, 0(rp)
-
-        lm      %r6, %r9, 24(%r15)      C restore r6-r9
-        br      %r14
-EPILOGUE() |