From 4c05211240a87d8eaa562b7c842ceac0a521bc02 Mon Sep 17 00:00:00 2001 From: Torbjorn Granlund Date: Tue, 22 Nov 2011 00:11:49 +0100 Subject: Add power6-specific addmul_1. --- ChangeLog | 4 + mpn/powerpc64/mode64/p6/addmul_1.asm | 147 +++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 mpn/powerpc64/mode64/p6/addmul_1.asm diff --git a/ChangeLog b/ChangeLog index 4d031a239..61631cce8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2011-11-22 Torbjorn Granlund + + * mpn/powerpc64/mode64/p6/addmul_1.asm: New file. + 2011-11-21 Torbjorn Granlund * gmp-h.in (__GNU_MP_RELEASE): Renamed from typo name. diff --git a/mpn/powerpc64/mode64/p6/addmul_1.asm b/mpn/powerpc64/mode64/p6/addmul_1.asm new file mode 100644 index 000000000..bffa6f308 --- /dev/null +++ b/mpn/powerpc64/mode64/p6/addmul_1.asm @@ -0,0 +1,147 @@ +dnl PowerPC-64 mpn_addmul_1 optimised for power6. + +dnl Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2008, 2010, 2011 +dnl Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 12.25 +C POWER7 ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Generalise to handle submul_1. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ASM_START() +PROLOGUE(mpn_addmul_1) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): addi up, up, 16 + addi rp, rp, 16 + ld r8, -16(up) + ld r7, -8(up) + ld r27, 0(up) + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + addc r5, r5, r29 + b L(l3) + +L(b2): addi up, up, 8 + addi rp, rp, 8 + ld r7, -8(up) + ld r27, 0(up) + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + addc r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + addc r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + addic r12, r0, 0 C clear r12 and cy (use that r0 = 0) + + ALIGN(32) +L(top): ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + addc r0, r0, r28 + std r0, -24(rp) + adde r5, r5, r29 +L(l3): std r5, -16(rp) + adde r9, r9, r30 +L(l2): std r9, -8(rp) + adde r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + + addze r3, r12 + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() -- cgit v1.2.1