author     Torbjorn Granlund <tege@gmplib.org>   2008-11-23 18:58:31 +0100
committer  Torbjorn Granlund <tege@gmplib.org>   2008-11-23 18:58:31 +0100
commit     45e6133791e8d3246df0e9953536dc88e29bf41c (patch)
tree       9281030cbdedd4ad975837fe991e690ef5b0c2b9 /mpn/powerpc32
parent     a9e7e18491f8feb3bab032356c62d9967461696a (diff)
download   gmp-45e6133791e8d3246df0e9953536dc88e29bf41c.tar.gz
New files for division with 2-limb divisor.
Diffstat (limited to 'mpn/powerpc32')
-rw-r--r--   mpn/powerpc32/divrem_2.asm   172
1 files changed, 172 insertions, 0 deletions
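
mpn_divrem_2 divides {up, un} by the normalized two-limb divisor {dp, 2}
(the most significant bit of dp[1] must be set), writes un - 2 + fn
quotient limbs to qp (the low fn of them fraction limbs), and returns the
most significant quotient limb, which is 0 or 1; per the generic
implementation's contract, the two-limb remainder ends up in up[0] and
up[1].  A minimal usage sketch in C, assuming a GMP build that exposes
mpn_divrem_2 through gmp.h:

  #include <stdio.h>
  #include <gmp.h>

  int
  main (void)
  {
    mp_limb_t np[4] = { 1, 2, 3, 4 };   /* dividend, least limb first */
    /* normalized divisor: high bit of the most significant limb set */
    mp_limb_t dp[2] = { 5, (mp_limb_t) 1 << (GMP_NUMB_BITS - 1) };
    mp_limb_t qp[2];                    /* un - 2 + fn = 2 limbs */

    mp_limb_t qh = mpn_divrem_2 (qp, 0, np, 4, dp);

    printf ("qh=%lu q1=%lu q0=%lu  r1=%lu r0=%lu\n",
            (unsigned long) qh, (unsigned long) qp[1], (unsigned long) qp[0],
            (unsigned long) np[1], (unsigned long) np[0]);
    return 0;
  }
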
diff --git a/mpn/powerpc32/divrem_2.asm b/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 000000000..a0a9db526
--- /dev/null
+++ b/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,172 @@
+dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C                   norm    frac
+C 7410              ~36.5   ~36.5
+C 744x, 745x         29      29
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
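+C
+C d points at the two divisor limbs, least significant limb first.
+C C prototype: mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn,
+C              mp_ptr up, mp_size_t un, mp_srcptr dp)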
+
+C TODO
+C * Decrease register usage.
+C * Make sure mul operands are optimal for early-out.
+C * Check that things work well for a shared library build.
+C * Write an invert_limb, perhaps inline, perhaps as a private call. Or at
+C least vastly improve the current __udiv_qrnnd_c based code.
+
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+ stwu r1, -32(r1)
+ slwi r0, r6, 2
+ add r5, r5, r0
+ stmw r28, 8(r1)
+ addi r29, r5, -8 C up = up_param + un - 2
+ lwz r10, 4(r7)
+ lwz r12, 4(r29)
+ addi r8, r3, -12
+ lwz r7, 0(r7)
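+C Most significant quotient limb is 1 iff {up[un-1], up[un-2]} >= {d1,d0};
+C if so, subtract d from the two high dividend limbs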
+ cmplw cr7, r12, r10
+ lwz r28, 0(r29)
+ blt- cr7, L(2)
+ bgt+ cr7, L(4)
+ cmplw cr7, r28, r7
+ blt- cr7, L(2)
+L(4): subfc r28, r7, r28
+ subfe r12, r10, r12
+ li r3, 1
+ b L(6)
+L(2): li r3, 0
+
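+C Quotient has fn + un - 2 limbs beyond the high one; return if none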
+L(6): add r0, r4, r6
+ addic. r30, r0, -2
+ ble- cr0, L(ret)
+
+ slwi r9, r0, 2
+ add r8, r8, r9 C rp += un + fn
+ mtctr r30
+
+C Compute di from d1
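+C (di = floor((B^2 - 1)/d1) - B with B = 2^32, computed 16 bits at a
+C time in the style of __udiv_qrnnd_c)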
+ srwi r11, r10, 16
+ nor r0, r10, r10
+ divwu r31, r0, r11
+ rlwinm r5, r10, 0, 16, 31
+ mullw r9, r11, r31
+ mullw r6, r5, r31
+ subf r0, r9, r0
+ slwi r0, r0, 16
+ ori r0, r0, 65535
+ cmplw cr7, r0, r6
+ bge- cr7, L(9)
+ add r0, r0, r10
+ cmplw cr7, r0, r10
+ cmplw cr6, r0, r6
+ addi r31, r31, -1 C q1--
+ cror 28, 28, 25
+ bc+ 12, 28, L(9)
+ addi r31, r31, -1 C q1--
+ add r0, r0, r10
+L(9): subf r0, r6, r0
+ divwu r6, r0, r11
+ mullw r9, r11, r6
+ mullw r11, r5, r6
+ subf r0, r9, r0
+ slwi r0, r0, 16
+ ori r0, r0, 65535
+ cmplw cr7, r0, r11
+ bge- cr7, L(13)
+ add r0, r0, r10
+ cmplw cr7, r0, r10
+ cmplw cr6, r0, r11
+ addi r6, r6, -1 C q0--
+ cror 28, 28, 25
+ bc+ 12, 28, L(13)
+C add r0, r0, r10 C final remainder
+ addi r6, r6, -1 C q0--
+L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble di from the two 16-bit quotients
+
+C Adjust di by including d0
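+C (decrement di while d1*di + d0 + HI(d0*di) carries out of a limb)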
+ mullw r9, r10, r6 C t0 = LO(di * d1)
+ addc r11, r9, r7
+ subfe r0, r1, r1
+ mulhwu r9, r6, r7 C s1 = HI(di * d0)
+ addc r9, r11, r9
+ addze. r0, r0
+ blt cr0, L(17)
+L(18): subfc r9, r10, r9
+ addi r6, r6, -1
+ addme. r0, r0
+ bge+ cr0, L(18)
+L(17):
+
+C Register usage in the loop:
+C   r3 = msl   r4 = fn   r5 = q    r6 = di   r7 = d0    r8 = qp
+C   r10 = d1   r12 = n2  r28 = n1  r29 = up  r30 = cnt  r31 = q0
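+C Main loop, one quotient limb per iteration: estimate q from n2 and di,
+C multiply back and subtract, then correct q by at most one each way
+C (add-back at L(24), rare extra subtraction at L(fix))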
+L(loop):
+ mullw r0, r12, r6 C q0 = LO(n2 * di)
+ cmpw cr7, r30, r4
+ addc r31, r0, r28 C q0 += n1
+ mulhwu r9, r12, r6 C q = HI(n2 * di)
+ adde r12, r9, r12 C q += n2
+ addi r30, r30, -1
+ mullw r0, r10, r12 C d1 * q
+ li r9, 0
+ subf r0, r0, r28 C n1 -= d1 * q
+ addi r5, r12, 1
+ ble- cr7, L(23)
+ lwzu r9, -4(r29)
+L(23):
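+C r9 now holds the next dividend limb, or zero within the fraction part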
+ mullw r11, r12, r7 C t0 = LO(d0 * q)
+ subfc r28, r7, r9 C n0 -= d0
+ subfe r0, r10, r0 C n1 -= d1
+ mulhwu r12, r12, r7 C t1 = HI(d0 * q)
+ subfc r28, r11, r28 C n0 -= t0
+ subfe r12, r12, r0 C n1 -= t1
+ cmplw cr7, r12, r31
+ blt+ cr7, L(24)
+ addc r28, r28, r7
+ adde r12, r12, r10
+ addi r5, r5, -1
+L(24):
+ cmplw cr7, r12, r10
+ bge- cr7, L(fix)
+L(bck):
+ stw r5, 0(r8)
+ addi r8, r8, -4
+ bdnz L(loop)
+
+L(ret): lmw r28, 8(r1)
+ addi r1, r1, 32
+ blr
+
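+C Partial remainder is still >= {d1,d0}: subtract d once more, increment
+C the quotient limb, and rejoin the loop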
+L(fix): cmplw cr6, r28, r7
+ bgt+ cr7, L(28)
+ blt- cr6, L(bck)
+L(28): subfc r28, r7, r28
+ subfe r12, r10, r12
+ addi r5, r5, 1
+ b L(bck)
+EPILOGUE()
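
For reference, the two scalar phases above can be sketched in C.  These
are illustrative reconstructions, not code from this commit; they assume
32-bit limbs with 64-bit intermediates, and the function names are made
up for this sketch.  The first corresponds to the "Compute di from d1"
section, which reaches the same value with two 16-bit divides:

  #include <stdint.h>

  typedef uint32_t limb;        /* B = 2^32 */

  /* di = floor((B^2 - 1) / d1) - B, for normalized d1 (high bit set).  */
  static limb
  reciprocal (limb d1)
  {
    return (limb) ((((uint64_t) (limb) ~d1 << 32) | 0xffffffffu) / d1);
  }

The second corresponds to one pass through L(loop), with di already
adjusted for d0 as in the "Adjust di by including d0" section: estimate
the quotient limb, multiply back and subtract, then apply the add-back
that L(24) skips and the rare extra correction of L(fix):

  /* Divide {n2,n1,n0} by {d1,d0}, given the adjusted inverse di.
     Returns the quotient limb; the remainder is left in *r1, *r0.  */
  static limb
  div_3by2 (limb *r1, limb *r0, limb n2, limb n1, limb n0,
            limb d1, limb d0, limb di)
  {
    uint64_t d = ((uint64_t) d1 << 32) | d0;
    uint64_t qq = (uint64_t) n2 * di + (((uint64_t) n2 << 32) | n1);
    limb q = (limb) (qq >> 32);         /* quotient estimate */
    limb q0 = (limb) qq;                /* low word, drives the fixup */
    limb t = n1 - q * d1;               /* mod B, as in the asm */
    uint64_t r = (((uint64_t) t << 32) | n0) - (uint64_t) q * d0 - d;
    q++;
    if ((limb) (r >> 32) >= q0)         /* add-back (skipped at L(24)) */
      {
        q--;
        r += d;
      }
    if (r >= d)                         /* rare correction (L(fix)) */
      {
        q++;
        r -= d;
      }
    *r1 = (limb) (r >> 32);
    *r0 = (limb) r;
    return q;
  }

Each loop iteration is exactly this step, with n0 taken from the next
dividend limb, or zero once the index has moved into the fraction part.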