diff options
Diffstat (limited to 'crypto/bn/asm/alpha.works/mul_c8.pl')
-rw-r--r-- | crypto/bn/asm/alpha.works/mul_c8.pl | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/crypto/bn/asm/alpha.works/mul_c8.pl b/crypto/bn/asm/alpha.works/mul_c8.pl
new file mode 100644
index 0000000000..525ca7494b
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_c8.pl
@@ -0,0 +1,195 @@
+#!/usr/local/bin/perl
+# alpha assembler
+#
+# Generator for what appears to be an 8x8-word multiply
+# (r[0..15] = a[0..7] * b[0..7]).  It relies on helpers defined elsewhere
+# (init_pool, wparam, NR, FR, ld, st, mul, muh, mul_add_c, QWPw, swtmp, ...);
+# their exact semantics are not visible here, so notes below are hedged.
+
+# bn_mul_comba8($name)
+#   $name - name handed to function_begin/function_end.
+# Parameters 0, 1 and 2 (via wparam) are used as $rp, $ap and $bp.
+# For each k = 0..14 the pairs a[i],b[j] with i+j == k are passed to
+# mul_add_c together with ($c0,$c1,$c2); $c0 is then stored to word k of
+# $rp and the triple is rotated.  Word 15 receives the final $c1.
+sub bn_mul_comba8
+	{
+	local($name)=@_;
+	local(@a,@b,$r,$c0,$c1,$c2);
+
+	$cnt=1;
+	&init_pool(3);
+
+	$rp=&wparam(0);
+	$ap=&wparam(1);
+	$bp=&wparam(2);
+
+	&function_begin($name,"");
+
+	&comment("");
+
+	&stack_push(2);
+	# Load words 0..7 of a and b, each from its own offset.  Words 4..7
+	# were previously all read from offset 1, which made a[4..7] and
+	# b[4..7] duplicates of word 1.
+	&ld(($a[0])=&NR(1),&QWPw(0,$ap));
+	&ld(($b[0])=&NR(1),&QWPw(0,$bp));
+	# Spill $reg_s0/$reg_s1 to the two stack slots; FR presumably frees them for reuse.
+	&st($reg_s0,&swtmp(0)); &FR($reg_s0);
+	&st($reg_s1,&swtmp(1)); &FR($reg_s1);
+	&ld(($a[1])=&NR(1),&QWPw(1,$ap));
+	&ld(($b[1])=&NR(1),&QWPw(1,$bp));
+	&ld(($a[2])=&NR(1),&QWPw(2,$ap));
+	&ld(($b[2])=&NR(1),&QWPw(2,$bp));
+	&ld(($a[3])=&NR(1),&QWPw(3,$ap));
+	&ld(($b[3])=&NR(1),&QWPw(3,$bp));
+	&ld(($a[4])=&NR(1),&QWPw(4,$ap));
+	&ld(($b[4])=&NR(1),&QWPw(4,$bp));
+	&ld(($a[5])=&NR(1),&QWPw(5,$ap));
+	&ld(($b[5])=&NR(1),&QWPw(5,$bp));
+	&ld(($a[6])=&NR(1),&QWPw(6,$ap));
+	&ld(($b[6])=&NR(1),&QWPw(6,$bp));
+	&ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+	&ld(($b[7])=&NR(1),&QWPw(7,$bp)); &FR($bp);
+
+	# Column 0 uses mul/muh directly on a[0],b[0] instead of mul_add_c.
+	($c0,$c1,$c2)=&NR(3);
+	&mov("zero",$c2);
+	&mul($a[0],$b[0],$c0);
+	&muh($a[0],$b[0],$c1);
+	&st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[1],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[0],$c0,$c1,$c2);
+	&st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]);
+	&mul_add_c($a[1],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[2],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[1],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]);
+	&st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]);
+	&mul_add_c($a[2],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[3],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[2],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]);
+	&st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]);
+	&mul_add_c($a[3],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[4],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[3],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]);
+	&st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]);
+	&mul_add_c($a[4],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[5],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[4],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]);
+	&st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]);
+	&mul_add_c($a[5],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[6],$b[5],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]);
+	&st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]);
+	&mul_add_c($a[6],$b[6],$c0,$c1,$c2);
+	&mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]);
+	&st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	&mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]);
+	&mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]);
+	&st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1);
+	($c0,$c1,$c2)=($c1,$c2,$c0);
+	&mov("zero",$c2);
+
+	# Last column: $c0 goes to word 14 and $c1 to word 15; no rotation follows.
+	&mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]);
+	&st($c0,&QWPw(14,$rp));
+	&st($c1,&QWPw(15,$rp));
+
+	&FR($c0,$c1,$c2);
+
+	# Reload $reg_s0/$reg_s1 from the slots written at entry, then drop the slots.
+	&ld($reg_s0,&swtmp(0));
+	&ld($reg_s1,&swtmp(1));
+	&stack_pop(2);
+
+	&function_end($name);
+
+	&fin_pool;
+	}
+
+1;