author | Torbjorn Granlund <tege@gmplib.org> | 2008-11-23 18:58:31 +0100
---|---|---
committer | Torbjorn Granlund <tege@gmplib.org> | 2008-11-23 18:58:31 +0100
commit | 45e6133791e8d3246df0e9953536dc88e29bf41c (patch) |
tree | 9281030cbdedd4ad975837fe991e690ef5b0c2b9 /mpn/powerpc32 |
parent | a9e7e18491f8feb3bab032356c62d9967461696a (diff) |
download | gmp-45e6133791e8d3246df0e9953536dc88e29bf41c.tar.gz |
New files for division with 2-limb divisor.
Diffstat (limited to 'mpn/powerpc32')
-rw-r--r-- | mpn/powerpc32/divrem_2.asm | 172 |
1 file changed, 172 insertions, 0 deletions
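As background for the divrem_2.asm file added below: the operation divides a multi-limb number by a normalized 2-limb divisor (the divisor's high limb has its top bit set), producing one quotient limb per loop iteration, a possible most significant quotient limb of 0 or 1 in the return value, and a 2-limb remainder. The C model here is only an illustration of that arithmetic on 32-bit limbs, not GMP's mpn_divrem_2 interface: the function name and argument layout are invented for this sketch, it ignores the fn fraction limbs handled by the assembly, and it uses the compiler's unsigned __int128 extension for the wide per-limb division step instead of the reciprocal-based code in the file.

```c
#include <stdint.h>

/* Illustration only (name and interface are hypothetical, not GMP's).
   Divide the n-limb number {up, n} (little-endian 32-bit limbs, n >= 2) by
   the normalized 2-limb divisor d1:d0 (d1 has its top bit set).  Writes
   n-2 quotient limbs to qp, stores the 2-limb remainder in *r1p:*r0p, and
   returns the most significant quotient limb (0 or 1).  Uses the GCC/Clang
   unsigned __int128 type for the 3-limb-by-2-limb division step. */
static uint32_t
div_by_2limb_model (uint32_t *qp, const uint32_t *up, int n,
                    uint32_t d1, uint32_t d0,
                    uint32_t *r1p, uint32_t *r0p)
{
  uint64_t d = ((uint64_t) d1 << 32) | d0;
  uint64_t r = ((uint64_t) up[n - 1] << 32) | up[n - 2];
  uint32_t msl = 0;

  if (r >= d)                         /* top quotient limb is 1; reduce once */
    {
      r -= d;
      msl = 1;
    }

  for (int i = n - 3; i >= 0; i--)
    {
      unsigned __int128 num = ((unsigned __int128) r << 32) | up[i];
      qp[i] = (uint32_t) (num / d);   /* next quotient limb (fits: r < d) */
      r = (uint64_t) (num % d);       /* 2-limb partial remainder */
    }

  *r1p = (uint32_t) (r >> 32);
  *r0p = (uint32_t) r;
  return msl;
}
```

Because the divisor is normalized, a single conditional subtraction leaves the running remainder below the divisor, so each per-limb quotient fits in one limb; the assembly's compare-and-adjust sequences maintain the same invariant without a wide divide in the loop.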
diff --git a/mpn/powerpc32/divrem_2.asm b/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 000000000..a0a9db526
--- /dev/null
+++ b/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,172 @@
+dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C                  norm    frac
+C 7410             ~36.5   ~36.5
+C 744x, 745x        29      29
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d  = r7
+
+C TODO
+C  * Decrease register usage.
+C  * Make sure mul operands and optimal for early-out.
+C  * Check that things work well for a shared library build.
+C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
+C    least vastly improve the current __udiv_qrnnd_c based code.
+
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+        stwu    r1, -32(r1)
+        slwi    r0, r6, 2
+        add     r5, r5, r0
+        stmw    r28, 8(r1)
+        addi    r29, r5, -8             C up = up_param + un - 2
+        lwz     r10, 4(r7)
+        lwz     r12, 4(r29)
+        addi    r8, r3, -12
+        lwz     r7, 0(r7)
+        cmplw   cr7, r12, r10
+        lwz     r28, 0(r29)
+        blt-    cr7, L(2)
+        bgt+    cr7, L(4)
+        cmplw   cr7, r28, r7
+        blt-    cr7, L(2)
+L(4):   subfc   r28, r7, r28
+        subfe   r12, r10, r12
+        li      r3, 1
+        b       L(6)
+L(2):   li      r3, 0
+
+L(6):   add     r0, r4, r6
+        addic.  r30, r0, -2
+        ble-    cr0, L(ret)
+
+        slwi    r9, r0, 2
+        add     r8, r8, r9              C rp += un + fn
+        mtctr   r30
+
+C Compute di from d1
+        srwi    r11, r10, 16
+        nor     r0, r10, r10
+        divwu   r31, r0, r11
+        rlwinm  r5, r10, 0, 16, 31
+        mullw   r9, r11, r31
+        mullw   r6, r5, r31
+        subf    r0, r9, r0
+        slwi    r0, r0, 16
+        ori     r0, r0, 65535
+        cmplw   cr7, r0, r6
+        bge-    cr7, L(9)
+        add     r0, r0, r10
+        cmplw   cr7, r0, r10
+        cmplw   cr6, r0, r6
+        addi    r31, r31, -1            C q1--
+        cror    28, 28, 25
+        bc+     12, 28, L(9)
+        addi    r31, r31, -1            C q1--
+        add     r0, r0, r10
+L(9):   subf    r0, r6, r0
+        divwu   r6, r0, r11
+        mullw   r9, r11, r6
+        mullw   r11, r5, r6
+        subf    r0, r9, r0
+        slwi    r0, r0, 16
+        ori     r0, r0, 65535
+        cmplw   cr7, r0, r11
+        bge-    cr7, L(13)
+        add     r0, r0, r10
+        cmplw   cr7, r0, r10
+        cmplw   cr6, r0, r11
+        addi    r6, r6, -1              C q0--
+        cror    28, 28, 25
+        bc+     12, 28, L(13)
+C       add     r0, r0, r10             C final remainder
+        addi    r6, r6, -1              C q0--
+L(13):  rlwimi  r6, r31, 16, 0, 15      C assemble final quotient
+
+C Adjust di by including d0
+        mullw   r9, r10, r6             C t0 = LO(di * d1)
+        addc    r11, r9, r7
+        subfe   r0, r1, r1
+        mulhwu  r9, r6, r7              C s1 = HI(di * d0)
+        addc    r9, r11, r9
+        addze.  r0, r0
+        blt     cr0, L(17)
+L(18):  subfc   r9, r10, r9
+        addi    r6, r6, -1
+        addme.  r0, r0
+        bge+    cr0, L(18)
+L(17):
+
+C r0  r3  r4  r5  r6  r7  r8  r9  r10  r11  r12  r28  r29  r30  r31
+C msl di d0 qp d1 fn up un
+L(loop):
+        mullw   r0, r12, r6             C q0 = LO(n2 * di)
+        cmpw    cr7, r30, r4
+        addc    r31, r0, r28            C q0 += n1
+        mulhwu  r9, r12, r6             C q  = HI(n2 * di)
+        adde    r12, r9, r12            C q += n2
+        addi    r30, r30, -1
+        mullw   r0, r10, r12            C d1 * q
+        li      r9, 0
+        subf    r0, r0, r28             C n1 -= d1 * q
+        addi    r5, r12, 1
+        ble-    cr7, L(23)
+        lwzu    r9, -4(r29)
+L(23):
+        mullw   r11, r12, r7            C t0 = LO(d0 * q)
+        subfc   r28, r7, r9             C n0 -= d0
+        subfe   r0, r10, r0             C n1 -= d1
+        mulhwu  r12, r12, r7            C t1 = HI(d0 * q)
+        subfc   r28, r11, r28           C n0 -= t0
+        subfe   r12, r12, r0            C n1 -= t1
+        cmplw   cr7, r12, r31
+        blt+    cr7, L(24)
+        addc    r28, r28, r7
+        adde    r12, r12, r10
+        addi    r5, r5, -1
+L(24):
+        cmplw   cr7, r12, r10
+        bge-    cr7, L(fix)
+L(bck):
+        stw     r5, 0(r8)
+        addi    r8, r8, -4
+        bdnz    L(loop)
+
+L(ret): lmw     r28, 8(r1)
+        addi    r1, r1, 32
+        blr
+
+L(fix): cmplw   cr6, r28, r7
+        bgt+    cr7, L(28)
+        blt-    cr6, L(bck)
+L(28):  subfc   r28, r7, r28
+        subfe   r12, r10, r12
+        addi    r5, r5, 1
+        b       L(bck)
+EPILOGUE()
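The block labelled "Compute di from d1", the "Adjust di by including d0" section, and the corrections inside L(loop) follow the usual pattern for division by invariant operands: precompute an approximate inverse of the divisor once, then replace each per-limb divide with multiplications by that inverse plus at most two conditional corrections. The sketch below shows that technique for the simpler 2-limb-by-1-limb case in C; it is a hedged illustration, not GMP code (the helper names are invented here), while the assembly above extends the same idea to a 2-limb divisor and builds its inverse from 16-bit divwu steps in the spirit of the __udiv_qrnnd_c code the TODO note mentions.

```c
#include <stdint.h>

typedef uint32_t limb_t;
typedef uint64_t dlimb_t;               /* holds a 2-limb value */

/* Approximate inverse of a normalized limb d (top bit set):
   inv = floor((B^2 - 1) / d) - B, with B = 2^32.  Computed once per divisor. */
static limb_t
limb_inverse (limb_t d)
{
  return (limb_t) ((((dlimb_t) ~d << 32) | 0xffffffffu) / d);
}

/* Divide the 2-limb value nh*B + nl (requires nh < d) by the normalized
   limb d using the precomputed inverse: two multiplications and at most two
   conditional corrections, with no divide instruction in the hot path. */
static limb_t
div_2by1_preinv (limb_t *rem, limb_t nh, limb_t nl, limb_t d, limb_t inv)
{
  dlimb_t t = (dlimb_t) nh * inv;       /* inv * high limb of numerator */
  t += ((dlimb_t) nh << 32) | nl;       /* + numerator (cannot overflow) */
  limb_t qh = (limb_t) (t >> 32) + 1;   /* quotient estimate */
  limb_t ql = (limb_t) t;
  limb_t r  = nl - qh * d;              /* remainder estimate (mod B) */

  if (r > ql)                           /* estimate was one too high */
    {
      qh -= 1;
      r  += d;
    }
  if (r >= d)                           /* rare: estimate was one too low */
    {
      qh += 1;
      r  -= d;
    }
  *rem = r;
  return qh;
}
```

The two trailing if statements mirror the compare-and-adjust branches in the assembly's inverse computation and main loop: the estimated quotient is off by at most a small amount, so a couple of cheap corrections replace another divide.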