diff options
Diffstat (limited to 'rts/gmp/mpn/alpha')
-rw-r--r-- | rts/gmp/mpn/alpha/README | 224 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/add_n.asm | 114 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/addmul_1.asm | 87 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/cntlz.asm | 68 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/default.m4 | 77 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev5/add_n.asm | 143 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev5/lshift.asm | 169 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev5/rshift.asm | 167 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev5/sub_n.asm | 143 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev6/addmul_1.asm | 474 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/ev6/gmp-mparam.h | 62 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/gmp-mparam.h | 64 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/invert_limb.asm | 345 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/lshift.asm | 104 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/mul_1.asm | 71 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/rshift.asm | 102 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/sub_n.asm | 114 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/submul_1.asm | 87 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/udiv_qrnnd.S | 151 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/umul.asm | 39 | ||||
-rw-r--r-- | rts/gmp/mpn/alpha/unicos.m4 | 63 |
21 files changed, 2868 insertions, 0 deletions
diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README new file mode 100644 index 0000000000..744260c7c5 --- /dev/null +++ b/rts/gmp/mpn/alpha/README @@ -0,0 +1,224 @@ +This directory contains mpn functions optimized for DEC Alpha processors. + +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of instructions that need +special handling by unwinding. It also says whether $27 is really +needed for computing the gp. The `.mask M' pseudo op says which +registers are saved on the stack, and at what offset in the frame. + +Cray code is very very different... + + +RELEVANT OPTIMIZATION ISSUES + +EV4 + +1. This chip has very limited store bandwidth. The on-chip L1 cache is + write-through, and a cache line is transferred from the store buffer to + the off-chip L2 in as much as 15 cycles on most systems. This delay hurts + mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. + +2. Pairing is possible between memory instructions and integer arithmetic + instructions. + +3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of + these cycles are pipelined. Thus, multiply instructions can be issued at + a rate of one each 21st cycle. + +EV5 + +1. The memory bandwidth of this chip seems excellent, both for loads and + stores. Even when the working set is larger than the on-chip L1 and L2 + caches, the performance remains almost unaffected. + +2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle. + umulh has a measured latency of 14 cycles and an issue rate of 1 each + 10th cycle. But the exact timing is somewhat confusing. + +3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 + are memory operations. This will take at least + ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles + We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data + cache cycles, which should be completely hidden in the 19 issue cycles. 
+ The computation is inherently serial, with these dependencies: + + ldq ldq + \ /\ + (or) addq | + |\ / \ | + | addq cmpult + \ | | + cmpult | + \ / + or + + I.e., 3 operations are needed between carry-in and carry-out, making 12 + cycles the absolute minimum for the 4 limbs. We could replace the `or' + with a cmoveq/cmovne, which could issue one cycle earlier than the `or', + but that might waste a cycle on EV4. The total depth remains unaffected, + since cmov has a latency of 2 cycles. + + addq + / \ + addq cmpult + | \ + cmpult -> cmovne + +Montgomery has a slightly different way of computing carry that requires one +less instruction, but has depth 4 (instead of the current 3). Since the +code is currently instruction issue bound, Montgomery's idea should save us +1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25 +cycles/limb. Unfortunately, this method will not be good for the EV6. + +EV6 + +Here we have a really parallel pipeline, capable of issuing up to 4 integer +instructions per cycle. One integer multiply instruction can issue each +cycle. To get optimal speed, we need to pretend we are vectorizing the code, +i.e., minimize the iterative dependencies. + +There are two dependencies to watch out for. 1) Address arithmetic +dependencies, and 2) carry propagation dependencies. + +We can avoid serializing due to address arithmetic by unrolling the loop, so +that addresses don't depend heavily on an index variable. Avoiding +serializing because of carry propagation is trickier; the ultimate performance +of the code will be determined by the number of latency cycles it takes from +accepting carry-in to a vector point until we can generate carry-out. + +Most integer instructions can execute in either the L0, U0, L1, or U1 +pipelines. Shifts only execute in U0 and U1, and multiply only in U1. + +CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but +they execute efficiently. 
But CMOV splits the mapping process (see pg 2-26 in +cmpwrgd.pdf), suggesting the CMOV should always be placed as the last +instruction of an aligned 4 instruction block (?). + +Perhaps the most important issue is the latency between the L0/U0 and L1/U1 +clusters; a result obtained on either cluster has an extra cycle of latency +for consumers in the opposite cluster. Because of the dynamic nature of the +implementation, it is hard to predict where an instruction will execute. + +The shift loops need (per limb): + 1 load (Lx pipes) + 1 store (Lx pipes) + 2 shift (Ux pipes) + 1 iaddlog (Lx pipes, Ux pipes) +Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb. + +For mpn_add_n, we currently have + 2 load (Lx pipes) + 1 store (Lx pipes) + 5 iaddlog (Lx pipes, Ux pipes) + +Again, we have a perfect balance and will be limited by carry propagation +delays, currently three cycles. The superoptimizer indicates that there +might be sequences that--using a final cmov--have a carry propagation delay +of just two. Montgomery's subtraction sequence could perhaps be used, by +complementing some operands. All in all, we should get down to 2 cycles +without many problems. + +For mpn_mul_1, we could do, just like for mpn_add_n: + not newlo,notnewlo + addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout + addq cyout,newhi,cylimb +and get 2-cycle carry propagation. The instructions needed will be + 1 ld (Lx pipes) + 1 st (Lx pipes) + 2 mul (U1 pipe) + 4 iaddlog (Lx pipes, Ux pipes) +issue1: addq not mul ld +issue2: cmpult addq mul st +Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb! + +Last, we have mpn_addmul_1. Almost certainly, we will get down to 3 +cycles/limb, which would be absolutely awesome. 
+ +Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen): + + i + s + s i + u n + e s + d t + r + i u +l n c +i s t +v t i +e r o + u n +v c +a t t +l i y +u o p +e n e +s s s + issue + in + cycle + -1 ldq + / \ + 0 | \ + | \ + 1 | | + | | + 2 | | ldq + | | / \ + 3 | mulq | \ + | \ | \ + 4 umulh \ | | + | | | | + 5 | | | | ldq + | | | | / \ + 4calm 6 | | ldq | mulq | \ + | | / | \ | \ + 4casm 7 | | / umulh \ | | +6 | || | | | | + 3aal 8 | || | | | | ldq +7 | || | | | | / \ + 4calm 9 | || | | ldq | mulq | \ +9 | || | | / | \ | \ + 4casm 10 | || | | / umulh \ | | +9 | || | || | | | | + 3aal 11 | addq | || | | | | ldq +9 | // \ | || | | | | / \ + 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 13 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 14 addq | addq | || | | | | ldq +10 \ | // \ | || | | | | / \ + 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 16 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 17 addq | addq | || | | | | +10 \ | // \ | || | | | | + 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq +13 \ / // \ | || | | / | \ + 4casm 19 addq cmpult stq | || | | / umulh \ +11 \ / | || | || | | + 3aal 20 addq | addq | || | | +10 \ | // \ | || | | + 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq + \ / // \ | || | | / + 22 addq cmpult stq | || | | / + \ / | || | || + 23 addq | addq | || + \ | // \ | || + 24 cy ----> \ cmpult addq<-cy | || + \ / // \ | || + 25 addq cmpult stq | || + \ / | || + 26 addq | addq + \ | // \ + 27 cy ----> \ cmpult addq<-cy + \ / // \ + 28 addq cmpult stq + \ / +As many as 6 consecutive points will be under execution simultaneously, or if we addq +schedule loads even further away, maybe 7 or 8. But the number of live quantities \ +is reasonable, and can easily be satisfied. 
cy ----> diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm new file mode 100644 index 0000000000..08d6a9f7b8 --- /dev/null +++ b/rts/gmp/mpn/alpha/add_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm new file mode 100644 index 0000000000..4ea900be6b --- /dev/null +++ b/rts/gmp/mpn/alpha/addmul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm new file mode 100644 index 0000000000..febb3b70d9 --- /dev/null +++ b/rts/gmp/mpn/alpha/cntlz.asm @@ -0,0 +1,68 @@ +dnl Alpha auxiliary for longlong.h's count_leading_zeros + +dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl DISCUSSION: + +dnl Other methods have been tried, and using a 128-entry table actually trims +dnl about 10% of the execution time (on a 21164) when the table is in the L1 +dnl cache. But under non-benchmarking conditions, the table will hardly be in +dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables +dnl are also possible, but they require many more instructions than the current +dnl code. (But for count_trailing_zeros, such tricks are beneficial.) +dnl Finally, converting to floating-point and extracting the exponent is much +dnl slower. 
+ +ASM_START() +PROLOGUE(MPN(count_leading_zeros)) + bis r31,63,r0 C initialize partial result count + + srl r16,32,r1 C shift down 32 steps -> r1 + cmovne r1,r1,r16 C select r1 if non-zero + cmovne r1,31,r0 C if r1 is nonzero choose smaller count + + srl r16,16,r1 C shift down 16 steps -> r1 + subq r0,16,r2 C generate new partial result count + cmovne r1,r1,r16 C choose new r1 if non-zero + cmovne r1,r2,r0 C choose new count if r1 was non-zero + + srl r16,8,r1 + subq r0,8,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,4,r1 + subq r0,4,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,2,r1 + subq r0,2,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,1,r1 C extract bit 1 + subq r0,r1,r0 C subtract it from partial result + + ret r31,(r26),1 +EPILOGUE(MPN(count_leading_zeros)) +ASM_END() diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4 new file mode 100644 index 0000000000..5f4c48dc73 --- /dev/null +++ b/rts/gmp/mpn/alpha/default.m4 @@ -0,0 +1,77 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +define(`ASM_START', + ` + .set noreorder + .set noat') + +define(`X',`0x$1') +define(`FLOAT64', + ` + .align 3 +$1: .t_floating $2') + +define(`PROLOGUE', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + .frame r30,0,r26 + .prologue 0') + +define(`PROLOGUE_GP', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + ldgp r29,0(r27) + .frame r30,0,r26 + .prologue 1') + +define(`EPILOGUE', + ` + .end $1') + +dnl Map register names r0, r1, etc, to `$0', `$1', etc. +dnl This is needed on all systems but Unicos +forloop(i,0,31, +`define(`r'i,``$''i)' +) +forloop(i,0,31, +`define(`f'i,``$f''i)' +) + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + +divert diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm new file mode 100644 index 0000000000..716d6404ae --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/add_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + addq r0,r4,r20 C 1st main add + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r20,r0,r25 C compute cy from last add + ldq r7,-8(r17) + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + 
cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm new file mode 100644 index 0000000000..cb181dda66 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/lshift.asm @@ -0,0 +1,169 @@ +dnl Alpha EV5 __gmpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,-16(r17) + subq r16,8,r16 + sll r4,r19,r5 + subq r17,8,r17 + subq r28,1,r28 + srl r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r28,$Loop0 + +$L0: sll r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) + beq r18,$Lend1 +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,-8(r16) + 
bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) + ret r31,(r26),1 + +$Lend: stq r24,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm new file mode 100644 index 0000000000..9940d83fad --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/rshift.asm @@ -0,0 +1,167 @@ +dnl Alpha EV5 __gmpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + subq r31,r19,r20 + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + sll r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,8(r17) + addq r16,8,r16 + srl r4,r19,r5 + addq r17,8,r17 + subq r28,1,r28 + sll r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r28,$Loop0 + +$L0: srl r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,8(r17) + subq r18,4,r18 + ldq r2,16(r17) + ldq r3,24(r17) + ldq r4,32(r17) + beq r18,$Lend1 +C warm up phase 2 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + ldq r1,40(r17) + srl r2,r19,r22 + ldq r2,48(r17) + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + ldq r3,56(r17) + srl r4,r19,r24 + ldq r4,64(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + + sll r1,r20,r7 + subq r18,4,r18 + srl r1,r19,r21 + unop C ldq 
r31,-96(r17) + + sll r2,r20,r8 + ldq r1,72(r17) + srl r2,r19,r22 + ldq r2,80(r17) + + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + + sll r3,r20,r5 + unop C ldq r31,-96(r17) + srl r3,r19,r23 + addq r16,32,r16 + + sll r4,r20,r6 + ldq r3,88(r17) + srl r4,r19,r24 + ldq r4,96(r17) + + addq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + sll r3,r20,r5 + srl r3,r19,r23 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 2/2 + stq r7,32(r16) + bis r5,r22,r5 + stq r8,40(r16) + bis r6,r23,r6 + stq r5,48(r16) + stq r6,56(r16) +C cool down phase 2/3 + stq r24,64(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 1/2 + stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + stq r5,16(r16) + stq r6,24(r16) + stq r24,32(r16) + ret r31,(r26),1 + +$Lend: stq r24,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm new file mode 100644 index 0000000000..5248a2aa38 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+dnl
+dnl Computes res_ptr[i] = s1_ptr[i] - s2_ptr[i] - borrow for i = 0..size-1
+dnl and returns the final borrow (0 or 1) in r0. The 1st loop is a software
+dnl pipeline handling 4 limbs per iteration; the 2nd loop mops up the
+dnl remaining 0-3 limbs. Each limb needs two cmpult's since a borrow can
+dnl come out of either the main subtract or the carry subtract; the two
+dnl borrow bits are then combined with bis (they cannot both be set).
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C cy add
+ subq r28,r25,r22 C 3rd main subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C cy add
+ subq r28,r25,r23 C 4th main subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
new file mode 100644
index 0000000000..2f588626a5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
@@ -0,0 +1,474 @@
+dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+dnl exactly 3.625 cycles/limb on EV6...
+
+dnl This code was written in close cooperation with ev6 pipeline expert
+dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+
+dnl Register usages for unrolled loop:
+dnl 0-3 mul's
+dnl 4-7 acc's
+dnl 8-15 mul results
+dnl 20,21 carry's
+dnl 22,23 save for stores
+
+dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the
+dnl carry's has to happen on one side {0} of the machine. Note, the total
+dnl use of U0, and the total use of L0 (after attending to the stores),
+dnl which is part of the reason why....
+
+dnl This is a great schedule for the d_cache, a poor schedule for the
+dnl b_cache. The lockup on U0 means that any stall can't be recovered
+dnl from. Consider a ldq in L1. Say that load gets stalled because it
+dnl collides with a fill from the b_cache. On the next cycle, this load
+dnl gets priority. It first looks at L0, and goes there. The instruction
+dnl we intended for L0 gets to look at L1, which is NOT where we want
+dnl it. It either stalls 1, because it can't go in L0, or goes there, and
+dnl causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles
+dnl back into the code! And, of course, put in prefetches. For the
+dnl accumulator, lds, intent to modify. For the multiplier, you might
+dnl want ldq, evict next, if you're not wanting to use it again soon. Use
+dnl 256 ahead of present pointer value. At a place where we have an mt
+dnl followed by a bookkeeping, put the bookkeeping in upper, and the
+dnl prefetch into lower.
+
+dnl Note, the usage of physical registers per cycle is smoothed off, as
+dnl much as possible.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have a ldq or stq to precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+dnl Optimization notes:
+dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
+dnl Reserved regs: r29 r30 r31
+dnl Free caller-saves regs in unrolled code: r24 r25 r28
+dnl We should swap some of the callee-saves regs for some of the free
+dnl caller-saves regs, saving some overhead cycles.
+dnl Most importantly, we should write fast code for the 0-7 case.
+dnl The code we use there are for the 21164, and runs at 7 cycles/limb
+dnl on the 21264. Should not be hard, if we write specialized code for
+dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+dnl need a jump table indexed by the low 3 bits of the count argument.
+dnl
+dnl Summary: res_ptr[0..size-1] += s1_ptr[0..size-1] * s2_limb, with the
+dnl final carry limb returned in r0. Sizes below 8 limbs go through the
+dnl simple $Loop0 pipeline; larger sizes run size mod 8 limbs through
+dnl $Loop1 and the remaining multiple of 8 through the unrolled $Loop.
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r18, $Lend0b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r18, $Lend0a C jump if size was == 2
+
+ ALIGN(8)
+$Loop0: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r18, 1, r18 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r18, $Loop0
+$Lend0a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ ret r31, (r26), 1
+$Lend0b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r0, r5, r0
+ ret r31, (r26), 1
+
+$Large:
+C size >= 8: set up a 240-byte frame and save callee-saved r9-r15.
+ lda $30, -240($30)
+ stq $9, 8($30)
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r0
+ beq r20, $Lunroll
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r20, $Lend1b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r20, $Lend1a C jump if size was == 2
+
+ ALIGN(8)
+$Loop1: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r20, 1, r20 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r20, $Loop1
+
+$Lend1a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ br r31, $Lunroll
+$Lend1b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r0, r5, r0
+
+$Lunroll:
+ lda r17, -16(r17) C L1 bookkeeping
+ lda r16, -16(r16) C L1 bookkeeping
+ bis r0, r31, r12
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq r2, 16(r17) C L1
+ ldq r3, 24(r17) C L1
+ lda r18, -1(r18) C L1 bookkeeping
+ ldq r6, 16(r16) C L1
+ ldq r7, 24(r16) C L1
+ ldq r0, 32(r17) C L1
+ mulq r19, r2, r13 C U1
+ ldq r1, 40(r17) C L1
+ umulh r19, r2, r14 C U1
+ mulq r19, r3, r15 C U1
+ lda r17, 64(r17) C L1 bookkeeping
+ ldq r4, 32(r16) C L1
+ ldq r5, 40(r16) C L1
+ umulh r19, r3, r8 C U1
+ ldq r2, -16(r17) C L1
+ mulq r19, r0, r9 C U1
+ ldq r3, -8(r17) C L1
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ mulq r19, r1, r11 C U1
+ cmpult r6, r13, r20 C L0 lo add => carry
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+ mulq r19, r3, r15 C U1
+ addq r8, r21, r8 C U0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C U1 bookkeeping
+
+C ____ MAIN UNROLLED LOOP ____
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, 16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, 24(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -1(r18) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, 16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, 24(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 32(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 40(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r17, 64(r17) C L1 bookkeeping
+ addq r4, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 32(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 40(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, -16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, -8(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ bis r31, r31, r31 C L1 mt
+ bgt r18, $Loop C U1 bookkeeping
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+ mulq r19, r1, r11 C U1
+ addq r12, r21, r12 C U0 hi mul + carry
+ cmpult r6, r13, r20 C L0 lo add => carry
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+ bis r31, r31, r31 C L0 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+ addq r12, r21, r0 C U0 hi mul + carry
+
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..7ea20577f8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Basic limb and C type widths for 64-bit Alpha. */
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-08-02. */
+/* NOTE(review): the thresholds below are algorithm switch-over points,
+   presumably in limbs -- confirm against GMP's tuning documentation. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 47
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 70
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 94
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 101
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 33
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 70
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..054ff2fe5f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,64 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Basic limb and C type widths for 64-bit Alpha. */
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 21164 family. The 21264 will require
+ different values, since it has such quick multiplication. */
+/* Generated by tuneup.c, 2000-07-19. */
+/* NOTE(review): the thresholds below are algorithm switch-over points,
+   presumably in limbs -- confirm against GMP's tuning documentation. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 22
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 53
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 31
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 47
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 64
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 17
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 4
+#endif
diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..a921b32b3f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,345 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery.
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
+ +include(`../config.m4') + +ASM_START() + +FLOAT64($C36,9223372036854775808.0) C 2^63 + +PROLOGUE_GP(mpn_invert_limb) + lda r30,-16(r30) + addq r16,r16,r1 + bne r1,$73 + lda r0,-1 + br r31,$Lend +$73: + srl r16,1,r1 + stq r1,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + lda r1,$C36 + ldt f10,0(r1) + divt f10,f1,f10 + lda r2,$invtab-4096 + srl r16,52,r1 + addq r1,r1,r1 + addq r1,r2,r1 + bic r1,6,r2 + ldq r2,0(r2) + bic r1,1,r1 + extwl r2,r1,r2 + sll r2,48,r0 + umulh r16,r0,r1 + addq r16,r1,r3 + stq r3,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + mult f1,f10,f1 + cvttqc f1,f1 + stt f1,0(r30) + ldq r4,0(r30) + subq r0,r4,r0 + umulh r16,r0,r1 + mulq r16,r0,r2 + addq r16,r1,r3 + bge r3,$Loop2 +$Loop1: addq r2,r16,r2 + cmpult r2,r16,r1 + addq r3,r1,r3 + addq r0,1,r0 + blt r3,$Loop1 +$Loop2: cmpult r2,r16,r1 + subq r0,1,r0 + subq r3,r1,r3 + subq r2,r16,r2 + bge r3,$Loop2 +$Lend: + lda r30,16(r30) + ret r31,(r26),1 +EPILOGUE(mpn_invert_limb) +DATASTART(`$invtab',4) + .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41 + .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46 + .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50 + .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d + .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e + .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483 + .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c + .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8 + .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8 + .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb + .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22 + .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d + .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b + .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad + .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2 + .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a + .word 
0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056 + .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95 + .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7 + .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d + .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965 + .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1 + .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600 + .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452 + .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7 + .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100 + .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b + .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9 + .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a + .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e + .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5 + .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f + .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb + .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a + .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c + .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111 + .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89 + .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03 + .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80 + .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff + .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981 + .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806 + .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d + .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516 + .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2 + .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231 + .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2 + .word 
0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55 + .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb + .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83 + .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e + .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb + .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a + .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb + .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f + .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445 + .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed + .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197 + .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044 + .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2 + .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3 + .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56 + .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b + .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2 + .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b + .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736 + .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3 + .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3 + .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374 + .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237 + .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc + .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3 + .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b + .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56 + .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23 + .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1 + .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1 + .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893 + .word 
0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767 + .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d + .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514 + .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee + .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9 + .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5 + .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084 + .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64 + .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45 + .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29 + .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e + .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5 + .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd + .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7 + .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2 + .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0 + .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e + .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e + .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370 + .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264 + .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158 + .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f + .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46 + .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40 + .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a + .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37 + .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34 + .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33 + .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934 + .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836 + .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739 + .word 
0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e + .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544 + .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b + .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354 + .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e + .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169 + .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076 + .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84 + .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93 + .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4 + .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6 + .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9 + .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add + .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3 + .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a + .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822 + .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b + .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656 + .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571 + .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e + .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac + .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb + .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec + .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d + .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030 + .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54 + .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79 + .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f + .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6 + .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee + .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18 + .word 
0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42 + .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e + .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a + .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8 + .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6 + .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626 + .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557 + .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489 + .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc + .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef + .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224 + .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a + .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091 + .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9 + .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01 + .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b + .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76 + .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1 + .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee + .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b + .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a + .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9 + .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea + .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b + .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d + .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0 + .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4 + .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539 + .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e + .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5 + .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c + .word 
0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255 + .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e + .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8 + .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033 + .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e + .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb + .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18 + .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66 + .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5 + .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05 + .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56 + .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7 + .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9 + .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c + .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0 + .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5 + .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a + .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0 + .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7 + .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e + .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7 + .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400 + .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a + .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4 + .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210 + .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c + .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8 + .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026 + .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84 + .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3 + .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42 + .word 
0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3 + .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04 + .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65 + .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8 + .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b + .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f + .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3 + .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958 + .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be + .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824 + .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b + .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3 + .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b + .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4 + .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e + .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498 + .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403 + .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f + .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db + .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247 + .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5 + .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123 + .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091 + .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001 + .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70 + .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1 + .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52 + .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3 + .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35 + .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8 + .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c + .word 
0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f + .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04 + .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79 + .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee + .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965 + .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db + .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853 + .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca + .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743 + .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc + .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635 + .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af + .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a + .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5 + .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420 + .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c + .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319 + .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296 + .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214 + .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192 + .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111 + .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090 + .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010 +DATAEND() +ASM_END() diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm new file mode 100644 index 0000000000..87c46f6fe7 --- /dev/null +++ b/rts/gmp/mpn/alpha/lshift.asm @@ -0,0 +1,104 @@ +dnl Alpha mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r17,8,r17 + subq r31,r19,r7 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + srl r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,-8(r17) + subq r16,8,r16 + subq r17,8,r17 + subq r20,1,r20 + sll r4,r19,r5 + srl r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,-8(r17) + subq r16,32,r16 + subq r18,4,r18 + sll r4,r19,r5 + srl r3,r7,r6 + + ldq r4,-16(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,24(r16) + srl r4,r7,r2 + + ldq r3,-24(r17) + sll r4,r19,r5 + bis r1,r2,r8 + stq r8,16(r16) + srl r3,r7,r6 + + ldq r4,-32(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,8(r16) + srl r4,r7,r2 + + subq r17,32,r17 + bis r1,r2,r8 + stq r8,0(r16) + + bgt r18,$Loop + +$Lend: sll r4,r19,r8 + stq r8,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm new file mode 100644 index 0000000000..46b8df34f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/mul_1.asm @@ -0,0 +1,71 @@ +dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. + +ASM_START() +PROLOGUE(mpn_mul_1) + ldq r2,0(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + stq r3,0(r16) + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r16,8,r16 C res_ptr++ + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: stq r3,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm new file mode 100644 index 0000000000..aa25eda54e --- /dev/null +++ b/rts/gmp/mpn/alpha/rshift.asm @@ -0,0 +1,102 @@ +dnl Alpha mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + addq r17,8,r17 + subq r31,r19,r7 + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + sll r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,0(r17) + addq r16,8,r16 + addq r17,8,r17 + subq r20,1,r20 + srl r4,r19,r5 + sll r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,0(r17) + addq r16,32,r16 + subq r18,4,r18 + srl r4,r19,r5 + sll r3,r7,r6 + + ldq r4,8(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-32(r16) + sll r4,r7,r2 + + ldq r3,16(r17) + srl r4,r19,r5 + bis r1,r2,r8 + stq r8,-24(r16) + sll r3,r7,r6 + + ldq r4,24(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-16(r16) + sll r4,r7,r2 + + addq r17,32,r17 + bis r1,r2,r8 + stq r8,-8(r16) + + bgt r18,$Loop + +$Lend: srl r4,r19,r8 + stq r8,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm new file mode 100644 index 0000000000..718f657141 --- /dev/null +++ b/rts/gmp/mpn/alpha/sub_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm new file mode 100644 index 0000000000..caec1a720b --- /dev/null +++ b/rts/gmp/mpn/alpha/submul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb 
vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S new file mode 100644 index 0000000000..53814bbcb0 --- /dev/null +++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S @@ -0,0 +1,151 @@ + # Alpha 21064 __udiv_qrnnd + + # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. 
+ + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + .set noreorder + .set noat +.text + .align 3 + .globl __gmpn_udiv_qrnnd + .ent __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd: + .frame $30,0,$26,0 + .prologue 0 +#define cnt $2 +#define tmp $3 +#define rem_ptr $16 +#define n1 $17 +#define n0 $18 +#define d $19 +#define qb $20 + + ldiq cnt,16 + blt d,.Largedivisor + +.Loop1: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop1 + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.Largedivisor: + and n0,1,$4 + + srl n0,1,n0 + sll n1,63,tmp + or tmp,n0,n0 + srl n1,1,n1 + + and d,1,$6 + srl d,1,$5 + addq $5,$6,$5 + +.Loop2: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + 
bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop2 + + addq n1,n1,n1 + addq $4,n1,n1 + bne $6,.LOdd + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.LOdd: + /* q' in n0. r' in n1 */ + addq n1,n0,n1 + cmpult n1,n0,tmp # tmp := carry from addq + beq tmp,.LLp6 + addq n0,1,n0 + subq n1,d,n1 +.LLp6: cmpult n1,d,tmp + bne tmp,.LLp7 + addq n0,1,n0 + subq n1,d,n1 +.LLp7: + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + + .end __gmpn_udiv_qrnnd diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm new file mode 100644 index 0000000000..44428ed5f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/umul.asm @@ -0,0 +1,39 @@ +dnl Currently unused. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + .set noreorder + .set volatile + .set noat + +.text + .align 3 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: +__umul_ppmm..ng: + .frame $30,0,$26,0 + .prologue 0 + mulq $17,$18,$1 + umulh $17,$18,$0 + stq $1,0($16) + ret $31,($26),1 + .end __umul_ppmm diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4 new file mode 100644 index 0000000000..7ff26c090c --- /dev/null +++ b/rts/gmp/mpn/alpha/unicos.m4 @@ -0,0 +1,63 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + `.ident dummy') + +define(`X',`^X$1') +define(`FLOAT64', + `dnl + .psect $1@crud,data +$1: .t_floating $2 + .endp') + +define(`PROLOGUE', + `dnl + .stack 192 ; What does this mean? Only Cray knows. 
+ .psect $1@code,code,cache +$1::') +define(`PROLOGUE_GP', `PROLOGUE($1)') + +define(`EPILOGUE', + `dnl + .endp') + +define(`DATASTART', + `dnl + .psect $1@crud,data +$1:') +define(`DATAEND', + `dnl + .endp') + +define(`ASM_END', + `dnl + .end') + +define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop +define(`cvttqc',`cvttq/c') + +define(`ALIGN',`') ; Unicos assembler seems to align using garbage + +divert + |