diff options
author | tege <tege@gmplib.org> | 2001-02-12 01:51:47 +0100 |
---|---|---|
committer | tege <tege@gmplib.org> | 2001-02-12 01:51:47 +0100 |
commit | 2879ac3a9a1427c232d3ef06fe979ba016ac1e99 (patch) | |
tree | 0ca6a801c777d7778fcf097580962b789cc08e39 /mpn/alpha | |
parent | 8a4b05e64fe3819b51da494eba2ac5d7440dc3d5 (diff) | |
download | gmp-2879ac3a9a1427c232d3ef06fe979ba016ac1e99.tar.gz |
Optimize mpn_sqr_diagonal. It now really runs at 18 cycles/limb on EV5, as the
comment claims, and it also runs well on EV6.
Diffstat (limited to 'mpn/alpha')
-rw-r--r-- | mpn/alpha/sqr_diagonal.asm | 14 |
1 file changed, 7 insertions, 7 deletions
```diff
diff --git a/mpn/alpha/sqr_diagonal.asm b/mpn/alpha/sqr_diagonal.asm
index 5b4e79f60..19060f926 100644
--- a/mpn/alpha/sqr_diagonal.asm
+++ b/mpn/alpha/sqr_diagonal.asm
@@ -26,7 +26,7 @@
 dnl res_ptr r16
 dnl s1_ptr r17
 dnl size r18
-dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and ??
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 4
 dnl cycles/limb on EV6.
 
 ASM_START()
@@ -37,22 +37,22 @@ PROLOGUE(mpn_sqr_diagonal)
 	umulh	r2,r2,r4	C r4 = prod_high
 	blt	r18,$Lend1	C jump if size was == 1
 	ldq	r2,8(r17)	C r2 = s1_limb
-	stq	r3,0(r16)
-	stq	r4,8(r16)
 	beq	r18,$Lend2	C jump if size was == 2
 
 	ALIGN(8)
-$Loop:	mulq	r2,r2,r3	C r3 = prod_low
+$Loop:	stq	r3,0(r16)
+	mulq	r2,r2,r3	C r3 = prod_low
 	lda	r18,-1(r18)	C size--
+	stq	r4,8(r16)
 	umulh	r2,r2,r4	C r4 = cy_limb
 	ldq	r2,16(r17)	C r2 = s1_limb
 	lda	r17,8(r17)	C s1_ptr++
-	stq	r3,16(r16)
-	stq	r4,24(r16)
 	lda	r16,16(r16)	C res_ptr++
 	bne	r18,$Loop
 
-$Lend2:	mulq	r2,r2,r3	C r3 = prod_low
+$Lend2:	stq	r3,0(r16)
+	mulq	r2,r2,r3	C r3 = prod_low
+	stq	r4,8(r16)
 	umulh	r2,r2,r4	C r4 = cy_limb
 	stq	r3,16(r16)
 	stq	r4,24(r16)
```