summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortege <tege@gmplib.org>2002-05-17 20:40:00 +0200
committertege <tege@gmplib.org>2002-05-17 20:40:00 +0200
commita09391e6e2a9ebc9d78d27e3213e04ca540afbe5 (patch)
treec2bc354a31d09c7123c2196b25226d3f75b89cb3
parentda2a8bee0aff5690ef1120e3feda650d21d50fb6 (diff)
downloadgmp-a09391e6e2a9ebc9d78d27e3213e04ca540afbe5.tar.gz
*** empty log message ***
-rw-r--r--mpn/alpha/ev6/nails/addmul_2.asm156
-rw-r--r--mpn/alpha/ev6/nails/addmul_3.asm182
-rw-r--r--mpn/alpha/ev6/nails/addmul_4.asm215
-rw-r--r--mpn/cray/README2
4 files changed, 554 insertions, 1 deletions
diff --git a/mpn/alpha/ev6/nails/addmul_2.asm b/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 000000000..ffb773bd8
--- /dev/null
+++ b/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,156 @@
+dnl Alpha ev6 nails mpn_addmul_2.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+dnl Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+dnl Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+dnl NAILS_SUPPORT(3-63)
+
+dnl Runs at 4.0 cycles/limb. With unrolling, the ulimb load and the 3
+dnl bookkeeping increments and the `bis' that copies from r21 to r5 could be
+dnl removed and the instruction count reduced from 21 to 16. We could
+dnl thereby reach about 2.3 cycles/limb.
+
+dnl If this is going to be a Karatsuba basecase building block, we need some
+dnl of the combinations below. That way, we won't ever hit the
+dnl slower mpn_addmul_1 for any huge multiplication.
+dnl
+dnl Alt 3 Alt 4 Alt 5 Alt 6
+dnl addmul_2 addmul_2 addmul_3 addmul_3
+dnl addmul_3 addmul_3 addmul_4 addmul_4
+dnl addmul_4 addmul_5 addmul_5
+dnl addmul_6
+
+dnl Register usage:
+dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
+dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
+dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
+dnl return address: 26
+dnl global pointer: 29
+dnl stack pointer: 30
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, r19
+
+C MAIN LOOP
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ beq n, Lend C U0
+ ALIGN(16)
+Loop:
+ bis r31, r31, r31 C nop
+ ldq rlimb, 0(rp)
+ ldq ulimb, 0(up)
+ addq r19, acc0, acc0 C propagate nail
+
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS,r8 C U0
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C nop
+
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS,r8 C U0
+ bis r31, r31, r31 C nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0
+ bis r31, m1b, acc1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C extract numb part
+
+ lda n, -1(n)
+ srl r19,NUMB_BITS, r19 C extract nail part
+ stq r28, -8(rp)
+ bne n, Loop C U0
+C END LOOP
+Lend:
+ ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS,r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS,r8 C U0
+ addq r8, acc0, acc0
+ bis r31, m1b, acc1
+ and r19,numb_mask, r28 C extract limb
+
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, r0
+
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_2)
+ASM_END()
diff --git a/mpn/alpha/ev6/nails/addmul_3.asm b/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 000000000..9455720d7
--- /dev/null
+++ b/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,182 @@
+dnl Alpha ev6 nails mpn_addmul_3.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+dnl Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+dnl Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+dnl NAILS_SUPPORT(3-63)
+
+dnl Runs at 3.0 cycles/limb. With unrolling, the ulimb load and the 3
+dnl bookkeeping increments and the `bis' that copies from r22 to r6 could be
+dnl removed and the instruction count reduced from 26 to 21. We could
+dnl thereby probably reach 2 cycles/limb, the IMUL bandwidth.
+
+dnl If this is going to be a Karatsuba basecase building block, we need some
+dnl of the combinations below. That way, we won't ever hit the
+dnl slower mpn_addmul_1 for any huge multiplication.
+dnl
+dnl Alt 3 Alt 4 Alt 5 Alt 6
+dnl addmul_2 addmul_2 addmul_3 addmul_3
+dnl addmul_3 addmul_3 addmul_4 addmul_4
+dnl addmul_4 addmul_5 addmul_5
+dnl addmul_6
+
+dnl Register usage:
+dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
+dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
+dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
+dnl return address: 26
+dnl global pointer: 29
+dnl stack pointer: 30
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, r19
+
+C MAIN LOOP
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ beq n, Lend C U0
+ ALIGN(16)
+Loop:
+ bis r31, r31, r31 C nop
+ ldq rlimb, 0(rp)
+ ldq ulimb, 0(up)
+ addq r19, acc0, acc0 C propagate nail
+
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C nop
+
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C extract numb part
+
+ bis r31, r31, r31 C nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1
+ bis r31, m2b, acc2
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C extract nail part
+
+ bis r31, r31, r31 C nop
+ stq r28, -8(rp)
+
+ bne n, Loop C U0
+C END LOOP
+Lend:
+ ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ bis r31, m2b, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, m0a
+
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_3)
+ASM_END()
diff --git a/mpn/alpha/ev6/nails/addmul_4.asm b/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 000000000..a08471014
--- /dev/null
+++ b/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,215 @@
+dnl Alpha ev6 nails mpn_addmul_4.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+dnl Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+dnl Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+dnl NAILS_SUPPORT(3-63)
+
+dnl Runs at 2.5 cycles/limb. With unrolling, the ulimb load and the 3
+dnl bookkeeping increments and the `bis' that copies from r23 to r7 could be
+dnl removed and the instruction count reduced from 31 to 26. We could
+dnl thereby surely reach 2 cycles/limb, the IMUL bandwidth.
+
+dnl If this is going to be a Karatsuba basecase building block, we need some
+dnl of the combinations below. That way, we won't ever hit the
+dnl slower mpn_addmul_1 for any huge multiplication.
+dnl
+dnl Alt 3 Alt 4 Alt 5 Alt 6
+dnl addmul_2 addmul_2 addmul_3 addmul_3
+dnl addmul_3 addmul_3 addmul_4 addmul_4
+dnl addmul_4 addmul_5 addmul_5
+dnl addmul_6
+
+dnl Register usage:
+dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
+dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
+dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
+dnl return address: 26
+dnl global pointer: 29
+dnl stack pointer: 30
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+ lda r30, -240(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+ ldq v3, 24(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, acc3 C zero acc3
+ sll v3,NAIL_BITS, v3
+ bis r31, r31, r19
+
+C MAIN LOOP
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ mulq v3, ulimb, m3a C U1
+ umulh v3, ulimb, m3b C U1
+ beq n, Lend C U0
+ ALIGN(16)
+Loop:
+ bis r31, r31, r31 C nop
+ ldq rlimb, 0(rp)
+ ldq ulimb, 0(up)
+ addq r19, acc0, acc0 C propagate nail
+
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C nop
+
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C extract numb part
+
+ bis r31, r31, r31 C nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1
+ addq m2b, acc3, acc2
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C extract nail part
+
+ bis r31, r31, r31 C nop
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp)
+ mulq v3, ulimb, m3a C U1
+
+ addq r8, acc2, acc2
+ bis r31, m3b, acc3
+ umulh v3, ulimb, m3b C U1
+ bne n, Loop C U0
+C END LOOP
+Lend:
+ ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ addq m2b, acc3, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp)
+ addq r8, acc2, acc2
+ bis r31, m3b, acc3
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, acc2
+
+ and acc2,numb_mask, r28
+ stq r28, 16(rp)
+ srl acc2,NUMB_BITS, r19
+ addq r19, acc3, r0
+
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 240(r30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_4)
+ASM_END()
diff --git a/mpn/cray/README b/mpn/cray/README
index a4c56ed0f..57d8e3dd2 100644
--- a/mpn/cray/README
+++ b/mpn/cray/README
@@ -38,7 +38,7 @@ computing carry is the main issue. There are no vectorizing
unsigned-less-than instructions, and the sequence that implement that
opetration is very long.
-Shifting is the only operation that s simple to make fast. All Cray
+Shifting is the only operation that is simple to make fast. All Cray
systems have a bitblt instructions (Vi Vj,Vj<Ak and Vi Vj,Vj>Ak) that
should be really useful.