author     Ralf S. Engelschall <rse@openssl.org>    1998-12-21 11:00:56 +0000
committer  Ralf S. Engelschall <rse@openssl.org>    1998-12-21 11:00:56 +0000
commit     dfeab0689f69c0b4bd3480ffd37a9cacc2f17d9c (patch)
tree       2f74e0cfd76a9e092548a9bf52e579aef984299b /crypto/bn/asm
parent     58964a492275ca9a59a0cd9c8155cb2491b4b909 (diff)
download   openssl-new-dfeab0689f69c0b4bd3480ffd37a9cacc2f17d9c.tar.gz
Import of old SSLeay release: SSLeay 0.9.1b (unreleased)
Diffstat (limited to 'crypto/bn/asm')
52 files changed, 18638 insertions, 185 deletions
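A note for readers of the patch below: Alpha has no carry flag, so the hand-scheduled routines derive every carry explicitly with cmpult (unsigned compare, less-than) after each 64-bit add, and take the high half of each 64x64 product with umulh. As orientation for the unrolled assembly, here is a minimal C sketch of what bn_mul_add_words computes — the "_sketch" function name and the use of GCC's unsigned __int128 to stand in for umulh are illustrative assumptions, not part of the patch:

    typedef unsigned long BN_ULONG;      /* 64-bit word on Alpha; assumed name */

    BN_ULONG bn_mul_add_words_sketch(BN_ULONG *r, const BN_ULONG *a,
                                     int num, BN_ULONG w)
    {
        BN_ULONG c = 0;                  /* running carry; lives in $0 in the asm */

        while (num-- > 0) {
            BN_ULONG lo = *a * w;        /* mulq:  low 64 bits of a[i]*w */
            BN_ULONG hi = (BN_ULONG)     /* umulh: high 64 bits of a[i]*w */
                (((unsigned __int128)*a * w) >> 64);
            BN_ULONG t = *r + lo;
            hi += (t < lo);              /* cmpult: carry out of r[i]+lo */
            t += c;
            c = hi + (t < c);            /* cmpult: carry out of adding c */
            *r = t;
            a++; r++;
        }
        return c;                        /* final carry word */
    }

The four-way unrolling in the new code is purely a scheduling device: it interleaves loads, adds, and cmpults into the multiply issue slots described by the CPU comment at the top of the patch.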
diff --git a/crypto/bn/asm/a.out b/crypto/bn/asm/a.out
new file mode 100644
index 0000000000..cc5094ff45
--- /dev/null
+++ b/crypto/bn/asm/a.out
Binary files differ
diff --git a/crypto/bn/asm/alpha.s b/crypto/bn/asm/alpha.s
index 1d17b1d619..cf0b69cff9 100644
--- a/crypto/bn/asm/alpha.s
+++ b/crypto/bn/asm/alpha.s
@@ -2,7 +2,13 @@
 	# The bn_div64 is actually gcc output but the other parts are hand done.
 	# Thanks to tzeruch@ceddec.com for sending me the gcc output for
 	# bn_div64.
-	.file 1 "bn_mulw.c"
+	# I've gone back and re-done most of the routines.
+	# The key thing to remember for the 164 CPU is that while a
+	# multiply operation takes 8 cycles, another one can only be issued
+	# after 4 cycles have elapsed. I've made modifications to help
+	# improve this. Also, normally, a ld instruction will not be available
+	# for about 3 cycles.
+	.file 1 "bn_asm.c"
 	.set noat
 gcc2_compiled.:
 __gnu_compiled_c:
@@ -14,65 +20,91 @@ bn_mul_add_words:
 bn_mul_add_words..ng:
 	.frame $30,0,$26,0
 	.prologue 0
-	subq $18,2,$25	# num=-2
-	bis $31,$31,$0
-	blt $25,$42
 	.align 5
-$142:
-	subq $18,2,$18	# num-=2
-	subq $25,2,$25	# num-=2
-
-	ldq $1,0($17)	# a[0]
-	ldq $2,8($17)	# a[1]
-
-	mulq $19,$1,$3	# a[0]*w low part	r3
-	umulh $19,$1,$1	# a[0]*w high part	r1
-	mulq $19,$2,$4	# a[1]*w low part	r4
-	umulh $19,$2,$2	# a[1]*w high part	r2
-
-	ldq $22,0($16)	# r[0]	r22
-	ldq $23,8($16)	# r[1]	r23
-
-	addq $3,$22,$3	# a0 low part + r[0]
-	addq $4,$23,$4	# a1 low part + r[1]
-	cmpult $3,$22,$5	# overflow?
-	cmpult $4,$23,$6	# overflow?
-	addq $5,$1,$1	# high part + overflow
-	addq $6,$2,$2	# high part + overflow
-
-	addq $3,$0,$3	# add c
-	cmpult $3,$0,$5	# overflow?
-	stq $3,0($16)
-	addq $5,$1,$0	# c=high part + overflow
-
-	addq $4,$0,$4	# add c
-	cmpult $4,$0,$5	# overflow?
-	stq $4,8($16)
-	addq $5,$2,$0	# c=high part + overflow
+	subq $18,4,$18
+	bis $31,$31,$0
+	blt $18,$43	# if we are -1, -2, -3 or -4 goto tail code
+	ldq $20,0($17)	# 1 1
+	ldq $1,0($16)	# 1 1
+	.align 3
+$42:
+	mulq $20,$19,$5	# 1 2 1	######
+	ldq $21,8($17)	# 2 1
+	ldq $2,8($16)	# 2 1
+	umulh $20,$19,$20	# 1 2	######
+	ldq $27,16($17)	# 3 1
+	ldq $3,16($16)	# 3 1
+	mulq $21,$19,$6	# 2 2 1	######
+	ldq $28,24($17)	# 4 1
+	addq $1,$5,$1	# 1 2 2
+	ldq $4,24($16)	# 4 1
+	umulh $21,$19,$21	# 2 2	######
+	cmpult $1,$5,$22	# 1 2 3 1
+	addq $20,$22,$20	# 1 3 1
+	addq $1,$0,$1	# 1 2 3 1
+	mulq $27,$19,$7	# 3 2 1	######
+	cmpult $1,$0,$0	# 1 2 3 2
+	addq $2,$6,$2	# 2 2 2
+	addq $20,$0,$0	# 1 3 2
+	cmpult $2,$6,$23	# 2 2 3 1
+	addq $21,$23,$21	# 2 3 1
+	umulh $27,$19,$27	# 3 2	######
+	addq $2,$0,$2	# 2 2 3 1
+	cmpult $2,$0,$0	# 2 2 3 2
+	subq $18,4,$18
+	mulq $28,$19,$8	# 4 2 1	######
+	addq $21,$0,$0	# 2 3 2
+	addq $3,$7,$3	# 3 2 2
+	addq $16,32,$16
+	cmpult $3,$7,$24	# 3 2 3 1
+	stq $1,-32($16)	# 1 2 4
+	umulh $28,$19,$28	# 4 2	######
+	addq $27,$24,$27	# 3 3 1
+	addq $3,$0,$3	# 3 2 3 1
+	stq $2,-24($16)	# 2 2 4
+	cmpult $3,$0,$0	# 3 2 3 2
+	stq $3,-16($16)	# 3 2 4
+	addq $4,$8,$4	# 4 2 2
+	addq $27,$0,$0	# 3 3 2
+	cmpult $4,$8,$25	# 4 2 3 1
+	addq $17,32,$17
+	addq $28,$25,$28	# 4 3 1
+	addq $4,$0,$4	# 4 2 3 1
+	cmpult $4,$0,$0	# 4 2 3 2
+	stq $4,-8($16)	# 4 2 4
+	addq $28,$0,$0	# 4 3 2
+	blt $18,$43
-	ble $18,$43
+	ldq $20,0($17)	# 1 1
+	ldq $1,0($16)	# 1 1
-	addq $16,16,$16
-	addq $17,16,$17
-	blt $25,$42
+	br $42
-	br $31,$142
-$42:
-	ldq $1,0($17)	# a[0]
-	umulh $19,$1,$3	# a[0]*w high part
-	mulq $19,$1,$1	# a[0]*w low part
-	ldq $2,0($16)	# r[0]
-	addq $1,$2,$1	# low part + r[0]
-	cmpult $1,$2,$4	# overflow?
- addq $4,$3,$3 # high part + overflow - addq $1,$0,$1 # add c - cmpult $1,$0,$4 # overflow? - addq $4,$3,$0 # c=high part + overflow - stq $1,0($16) + .align 4 +$45: + ldq $20,0($17) # 4 1 + ldq $1,0($16) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + addq $16,8,$16 + addq $17,8,$17 + umulh $20,$19,$20 # 4 2 + addq $1,$5,$1 # 4 2 2 + cmpult $1,$5,$22 # 4 2 3 1 + addq $20,$22,$20 # 4 3 1 + addq $1,$0,$1 # 4 2 3 1 + cmpult $1,$0,$0 # 4 2 3 2 + addq $20,$0,$0 # 4 3 2 + stq $1,-8($16) # 4 2 4 + bgt $18,$45 + ret $31,($26),1 # else exit .align 4 $43: - ret $31,($26),1 + addq $18,4,$18 + bgt $18,$45 # goto tail code + ret $31,($26),1 # else exit + .end bn_mul_add_words .align 3 .globl bn_mul_words @@ -81,49 +113,75 @@ bn_mul_words: bn_mul_words..ng: .frame $30,0,$26,0 .prologue 0 - subq $18,2,$25 # num=-2 - bis $31,$31,$0 - blt $25,$242 .align 5 -$342: - subq $18,2,$18 # num-=2 - subq $25,2,$25 # num-=2 - - ldq $1,0($17) # a[0] - ldq $2,8($17) # a[1] - - mulq $19,$1,$3 # a[0]*w low part r3 - umulh $19,$1,$1 # a[0]*w high part r1 - mulq $19,$2,$4 # a[1]*w low part r4 - umulh $19,$2,$2 # a[1]*w high part r2 - - addq $3,$0,$3 # add c - cmpult $3,$0,$5 # overflow? - stq $3,0($16) - addq $5,$1,$0 # c=high part + overflow - - addq $4,$0,$4 # add c - cmpult $4,$0,$5 # overflow? - stq $4,8($16) - addq $5,$2,$0 # c=high part + overflow - - ble $18,$243 - - addq $16,16,$16 - addq $17,16,$17 - blt $25,$242 - - br $31,$342 -$242: - ldq $1,0($17) # a[0] - umulh $19,$1,$3 # a[0]*w high part - mulq $19,$1,$1 # a[0]*w low part - addq $1,$0,$1 # add c - cmpult $1,$0,$4 # overflow? - addq $4,$3,$0 # c=high part + overflow - stq $1,0($16) -$243: - ret $31,($26),1 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$142: + + mulq $20,$19,$5 # 1 2 1 ##### + ldq $21,8($17) # 2 1 + ldq $27,16($17) # 3 1 + umulh $20,$19,$20 # 1 2 ##### + ldq $28,24($17) # 4 1 + mulq $21,$19,$6 # 2 2 1 ##### + addq $5,$0,$5 # 1 2 3 1 + subq $18,4,$18 + cmpult $5,$0,$0 # 1 2 3 2 + umulh $21,$19,$21 # 2 2 ##### + addq $20,$0,$0 # 1 3 2 + addq $17,32,$17 + addq $6,$0,$6 # 2 2 3 1 + mulq $27,$19,$7 # 3 2 1 ##### + cmpult $6,$0,$0 # 2 2 3 2 + addq $21,$0,$0 # 2 3 2 + addq $16,32,$16 + umulh $27,$19,$27 # 3 2 ##### + stq $5,-32($16) # 1 2 4 + mulq $28,$19,$8 # 4 2 1 ##### + addq $7,$0,$7 # 3 2 3 1 + stq $6,-24($16) # 2 2 4 + cmpult $7,$0,$0 # 3 2 3 2 + umulh $28,$19,$28 # 4 2 ##### + addq $27,$0,$0 # 3 3 2 + stq $7,-16($16) # 3 2 4 + addq $8,$0,$8 # 4 2 3 1 + cmpult $8,$0,$0 # 4 2 3 2 + + addq $28,$0,$0 # 4 3 2 + + stq $8,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $20,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $20,0($17) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + umulh $20,$19,$20 # 4 2 + addq $5,$0,$5 # 4 2 3 1 + addq $16,8,$16 + cmpult $5,$0,$0 # 4 2 3 2 + addq $17,8,$17 + addq $20,$0,$0 # 4 3 2 + stq $5,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + .end bn_mul_words .align 3 .globl bn_sqr_words @@ -132,44 +190,58 @@ bn_sqr_words: bn_sqr_words..ng: .frame $30,0,$26,0 .prologue 0 - - subq $18,2,$25 # num=-2 - blt $25,$442 - .align 5 -$542: - subq $18,2,$18 # num-=2 - subq $25,2,$25 # num-=2 - - ldq $1,0($17) # a[0] - ldq $4,8($17) # a[1] - mulq $1,$1,$2 # a[0]*w low part r2 - umulh $1,$1,$3 # a[0]*w high part r3 - mulq $4,$4,$5 # a[1]*w low part r5 - umulh $4,$4,$6 # a[1]*w high part r6 - - stq $2,0($16) # r[0] - stq $3,8($16) # 
r[1] - stq $5,16($16) # r[3] - stq $6,24($16) # r[4] + subq $18,4,$18 + blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$542: + mulq $20,$20,$5 ###### + ldq $21,8($17) # 1 1 + subq $18,4 + umulh $20,$20,$1 ###### + ldq $27,16($17) # 1 1 + mulq $21,$21,$6 ###### + ldq $28,24($17) # 1 1 + stq $5,0($16) # r[0] + umulh $21,$21,$2 ###### + stq $1,8($16) # r[1] + mulq $27,$27,$7 ###### + stq $6,16($16) # r[0] + umulh $27,$27,$3 ###### + stq $2,24($16) # r[1] + mulq $28,$28,$8 ###### + stq $7,32($16) # r[0] + umulh $28,$28,$4 ###### + stq $3,40($16) # r[1] - ble $18,$443 + addq $16,64,$16 + addq $17,32,$17 + stq $8,-16($16) # r[0] + stq $4,-8($16) # r[1] - addq $16,32,$16 - addq $17,16,$17 - blt $25,$442 - br $31,$542 + blt $18,$543 + ldq $20,0($17) # 1 1 + br $542 $442: - ldq $1,0($17) # a[0] - mulq $1,$1,$2 # a[0]*w low part r2 - umulh $1,$1,$3 # a[0]*w high part r3 - stq $2,0($16) # r[0] - stq $3,8($16) # r[1] + ldq $20,0($17) # a[0] + mulq $20,$20,$5 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $20,$20,$1 # a[0]*w high part r3 + stq $5,-16($16) # r[0] + stq $1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit .align 4 -$443: - ret $31,($26),1 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit .end bn_sqr_words .align 3 @@ -180,31 +252,74 @@ bn_add_words..ng: .frame $30,0,$26,0 .prologue 0 - bis $31,$31,$8 # carry = 0 - ble $19,$900 + subq $19,4,$19 + bis $31,$31,$0 # carry = 0 + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + .align 3 $901: - ldq $0,0($17) # a[0] - ldq $1,0($18) # a[1] + addq $1,$5,$1 # r=a+b; + ldq $6,8($17) # a[1] + cmpult $1,$5,$22 # did we overflow? + ldq $2,8($18) # b[1] + addq $1,$0,$1 # c+= overflow + ldq $7,16($17) # a[2] + cmpult $1,$0,$0 # overflow? + ldq $3,16($18) # b[2] + addq $0,$22,$0 + ldq $8,24($17) # a[3] + addq $2,$6,$2 # r=a+b; + ldq $4,24($18) # b[3] + cmpult $2,$6,$23 # did we overflow? + addq $3,$7,$3 # r=a+b; + addq $2,$0,$2 # c+= overflow + cmpult $3,$7,$24 # did we overflow? + cmpult $2,$0,$0 # overflow? + addq $4,$8,$4 # r=a+b; + addq $0,$23,$0 + cmpult $4,$8,$25 # did we overflow? + addq $3,$0,$3 # c+= overflow + stq $1,0($16) # r[0]=c + cmpult $3,$0,$0 # overflow? + stq $2,8($16) # r[1]=c + addq $0,$24,$0 + stq $3,16($16) # r[2]=c + addq $4,$0,$4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $4,$0,$0 # overflow? + addq $17,32,$17 # a++ + addq $0,$25,$0 + stq $4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ - addq $0,$1,$3 # c=a+b; + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + addq $1,$5,$1 # r=a+b; + subq $19,1,$19 # loop-- + addq $1,$0,$1 # c+= overflow addq $17,8,$17 # a++ + cmpult $1,$5,$22 # did we overflow? + cmpult $1,$0,$0 # overflow? + addq $18,8,$18 # b++ + stq $1,0($16) # r[0]=c + addq $0,$22,$0 + addq $16,8,$16 # r++ - cmpult $3,$1,$7 # did we overflow? - addq $18,8,$18 # b++ - - addq $8,$3,$3 # c+=carry + bgt $19,$945 + ret $31,($26),1 # else exit - cmpult $3,$8,$8 # did we overflow? 
- stq $3,($16) # r[0]=c - - addq $7,$8,$8 # add into overflow - subq $19,1,$19 # loop-- - - addq $16,8,$16 # r++ - bgt $19,$901 $900: - bis $8,$8,$0 # return carry - ret $31,($26),1 + addq $19,4,$19 + bgt $19,$945 # goto tail code + ret $31,($26),1 # else exit .end bn_add_words # @@ -339,6 +454,1445 @@ $136: addq $30,48,$30 ret $31,($26),1 .end bn_div64 - .ident "GCC: (GNU) 2.7.2.1" + .set noat + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) +$101: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) + br $101 +$102: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $102 + ret $31,($26),1 +$100: + addq $19, 4, $19 + bgt $19, $102 +$103: + ret $31,($26),1 + .end bn_sub_words + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + bis $31, $31, $23 + mulq $0, $1, $8 + umulh $0, $1, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $0, $3, $24 + umulh $0, $3, $25 + addq $22, $24, $22 + cmpult $22, $24, $27 + addq $27, $25, $25 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $8, $28, $8 + mulq $2, $1, $21 + umulh $2, $1, $20 + addq $22, $21, $22 + cmpult $22, $21, $19 + addq $19, $20, $20 + addq $23, $20, $23 + cmpult $23, $20, $17 + addq $8, $17, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $2, $3, $18 + umulh $2, $3, $24 + addq $23, $18, $23 + cmpult $23, $18, $27 + addq $27, $24, $24 + addq $8, $24, $8 + cmpult $8, $24, $25 + addq $22, $25, $22 + mulq $0, $5, $28 + umulh $0, $5, $21 + addq $23, $28, $23 + cmpult $23, $28, $19 + addq $19, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + mulq $4, $1, $17 + umulh $4, $1, $18 + addq $23, $17, $23 + cmpult $23, $17, $27 + addq $27, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $24 + addq $22, $24, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $0, $7, $25 + umulh $0, $7, $28 + addq $8, $25, $8 + cmpult $8, $25, $19 + addq $19, $28, $28 + addq $22, $28, $22 + cmpult $22, $28, $21 + addq $23, $21, $23 + mulq $2, $5, $20 + umulh $2, $5, $17 + addq $8, $20, $8 + cmpult $8, $20, $27 + addq $27, $17, $17 + addq $22, $17, $22 + cmpult $22, $17, $18 + addq $23, $18, $23 + mulq $4, $3, $24 + umulh $4, $3, $25 + addq $8, $24, $8 + cmpult $8, $24, $19 + addq $19, $25, $25 + addq $22, $25, $22 + cmpult $22, $25, $28 + addq $23, $28, $23 + mulq $6, $1, $21 + umulh $6, $1, $0 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $20, 
$0, $0 + addq $22, $0, $22 + cmpult $22, $0, $27 + addq $23, $27, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $7, $17 + umulh $2, $7, $18 + addq $22, $17, $22 + cmpult $22, $17, $24 + addq $24, $18, $18 + addq $23, $18, $23 + cmpult $23, $18, $19 + addq $8, $19, $8 + mulq $4, $5, $25 + umulh $4, $5, $28 + addq $22, $25, $22 + cmpult $22, $25, $21 + addq $21, $28, $28 + addq $23, $28, $23 + cmpult $23, $28, $20 + addq $8, $20, $8 + mulq $6, $3, $0 + umulh $6, $3, $27 + addq $22, $0, $22 + cmpult $22, $0, $1 + addq $1, $27, $27 + addq $23, $27, $23 + cmpult $23, $27, $17 + addq $8, $17, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $4, $7, $24 + umulh $4, $7, $18 + addq $23, $24, $23 + cmpult $23, $24, $19 + addq $19, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $2 + addq $22, $2, $22 + mulq $6, $5, $25 + umulh $6, $5, $21 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $28, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $6, $7, $0 + umulh $6, $7, $1 + addq $8, $0, $8 + cmpult $8, $0, $27 + addq $27, $1, $1 + addq $22, $1, $22 + cmpult $22, $1, $17 + addq $23, $17, $23 + stq $8, 48($16) + stq $22, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $30, 16, $30 + ldq $0, 0($17) + ldq $1, 0($18) + stq $9, 0($30) + stq $10, 8($30) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + ldq $8, 8($17) + ldq $22, 8($18) + ldq $23, 8($17) + ldq $24, 8($18) + ldq $25, 8($17) + ldq $27, 8($18) + ldq $28, 8($17) + ldq $21, 8($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $20 + mulq $0, $3, $10 + umulh $0, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $2, $1, $18 + umulh $2, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 8($16) + bis $31, $31, $19 + mulq $0, $5, $10 + umulh $0, $5, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $3, $18 + umulh $2, $3, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $1, $10 + umulh $4, $1, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 16($16) + bis $31, $31, $9 + mulq $0, $7, $18 + umulh $0, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $2, $5, $10 + umulh $2, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $4, $3, $18 + umulh $4, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $6, $1, $10 + umulh $6, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 24($16) + bis $31, $31, $20 + mulq $0, $22, $18 + umulh $0, $22, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + 
cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 32($16) + bis $31, $31, $19 + mulq $0, $24, $10 + umulh $0, $24, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $22, $18 + umulh $2, $22, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $7, $10 + umulh $4, $7, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $6, $5, $18 + umulh $6, $5, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $8, $3, $10 + umulh $8, $3, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $23, $1, $18 + umulh $23, $1, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + stq $9, 40($16) + bis $31, $31, $9 + mulq $0, $27, $10 + umulh $0, $27, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $2, $24, $18 + umulh $2, $24, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $4, $22, $10 + umulh $4, $22, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $6, $7, $18 + umulh $6, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $8, $5, $10 + umulh $8, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $23, $3, $18 + umulh $23, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $25, $1, $10 + umulh $25, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 48($16) + bis $31, $31, $20 + mulq $0, $21, $18 + umulh $0, $21, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $27, $10 + umulh $2, $27, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $4, $24, $10 + umulh $4, $24, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $6, $22, $10 + umulh $6, $22, $17 + addq $19, 
$10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $8, $7, $10 + umulh $8, $7, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $23, $5, $10 + umulh $23, $5, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $25, $3, $10 + umulh $25, $3, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $28, $1, $10 + umulh $28, $1, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + stq $19, 56($16) + bis $31, $31, $19 + mulq $2, $21, $10 + umulh $2, $21, $18 + addq $9, $10, $9 + cmpult $9, $10, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $0 + addq $19, $0, $19 + mulq $4, $27, $1 + umulh $4, $27, $10 + addq $9, $1, $9 + cmpult $9, $1, $17 + addq $17, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $19, $18, $19 + mulq $6, $24, $0 + umulh $6, $24, $2 + addq $9, $0, $9 + cmpult $9, $0, $1 + addq $1, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $17 + addq $19, $17, $19 + mulq $8, $22, $10 + umulh $8, $22, $18 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $1 + addq $19, $1, $19 + mulq $23, $7, $2 + umulh $23, $7, $17 + addq $9, $2, $9 + cmpult $9, $2, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $0 + addq $19, $0, $19 + mulq $25, $5, $18 + umulh $25, $5, $1 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $10 + addq $19, $10, $19 + mulq $28, $3, $17 + umulh $28, $3, $0 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $19, $2, $19 + stq $9, 64($16) + bis $31, $31, $9 + mulq $4, $21, $1 + umulh $4, $21, $10 + addq $20, $1, $20 + cmpult $20, $1, $17 + addq $17, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $9, $18, $9 + mulq $6, $27, $0 + umulh $6, $27, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + mulq $8, $24, $17 + umulh $8, $24, $10 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $18, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $4 + addq $9, $4, $9 + mulq $23, $22, $0 + umulh $23, $22, $3 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $1 + addq $9, $1, $9 + mulq $25, $7, $17 + umulh $25, $7, $18 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $10, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $4 + addq $9, $4, $9 + mulq $28, $5, $0 + umulh $28, $5, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + stq $20, 72($16) + bis $31, $31, $20 + mulq $6, $21, $17 + umulh $6, $21, $10 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $10, $10 + addq $9, $10, $9 + cmpult $9, $10, $4 + addq $20, $4, $20 + mulq $8, $27, $0 + umulh $8, $27, $3 + addq $19, $0, $19 + cmpult $19, $0, $2 + addq $2, $3, $3 + addq $9, $3, $9 + cmpult $9, $3, $1 + addq $20, $1, $20 + mulq $23, $24, $5 + umulh $23, $24, $17 + addq $19, $5, $19 + cmpult $19, $5, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $25, $22, $4 + umulh $25, $22, $6 + addq $19, $4, $19 + 
cmpult $19, $4, $0 + addq $0, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $2 + addq $20, $2, $20 + mulq $28, $7, $3 + umulh $28, $7, $1 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $5, $1, $1 + addq $9, $1, $9 + cmpult $9, $1, $18 + addq $20, $18, $20 + stq $19, 80($16) + bis $31, $31, $19 + mulq $8, $21, $17 + umulh $8, $21, $10 + addq $9, $17, $9 + cmpult $9, $17, $4 + addq $4, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $0 + addq $19, $0, $19 + mulq $23, $27, $6 + umulh $23, $27, $2 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $3, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $5 + addq $19, $5, $19 + mulq $25, $24, $1 + umulh $25, $24, $18 + addq $9, $1, $9 + cmpult $9, $1, $7 + addq $7, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $19, $17, $19 + mulq $28, $22, $4 + umulh $28, $22, $10 + addq $9, $4, $9 + cmpult $9, $4, $0 + addq $0, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $8 + addq $19, $8, $19 + stq $9, 88($16) + bis $31, $31, $9 + mulq $23, $21, $6 + umulh $23, $21, $3 + addq $20, $6, $20 + cmpult $20, $6, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $9, $5, $9 + mulq $25, $27, $1 + umulh $25, $27, $7 + addq $20, $1, $20 + cmpult $20, $1, $18 + addq $18, $7, $7 + addq $19, $7, $19 + cmpult $19, $7, $17 + addq $9, $17, $9 + mulq $28, $24, $4 + umulh $28, $24, $0 + addq $20, $4, $20 + cmpult $20, $4, $10 + addq $10, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $8 + addq $9, $8, $9 + stq $20, 96($16) + bis $31, $31, $20 + mulq $25, $21, $22 + umulh $25, $21, $6 + addq $19, $22, $19 + cmpult $19, $22, $2 + addq $2, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $20, $3, $20 + mulq $28, $27, $5 + umulh $28, $27, $23 + addq $19, $5, $19 + cmpult $19, $5, $1 + addq $1, $23, $23 + addq $9, $23, $9 + cmpult $9, $23, $18 + addq $20, $18, $20 + stq $19, 104($16) + bis $31, $31, $19 + mulq $28, $21, $7 + umulh $28, $21, $17 + addq $9, $7, $9 + cmpult $9, $7, $4 + addq $4, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 112($16) + stq $20, 120($16) + ldq $9, 0($30) + ldq $10, 8($30) + addq $30, 16, $30 + ret $31,($26),1 + .end bn_mul_comba8 + .text + .align 3 + .globl bn_sqr_comba4 + .ent bn_sqr_comba4 +bn_sqr_comba4: +bn_sqr_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + bis $31, $31, $6 + mulq $0, $0, $4 + umulh $0, $0, $5 + stq $4, 0($16) + bis $31, $31, $4 + mulq $0, $1, $7 + umulh $0, $1, $8 + cmplt $7, $31, $22 + cmplt $8, $31, $23 + addq $7, $7, $7 + addq $8, $8, $8 + addq $8, $22, $8 + addq $4, $23, $4 + addq $5, $7, $5 + addq $6, $8, $6 + cmpult $5, $7, $24 + cmpult $6, $8, $25 + addq $6, $24, $6 + addq $4, $25, $4 + stq $5, 8($16) + bis $31, $31, $5 + mulq $1, $1, $27 + umulh $1, $1, $28 + addq $6, $27, $6 + addq $4, $28, $4 + cmpult $6, $27, $21 + cmpult $4, $28, $20 + addq $4, $21, $4 + addq $5, $20, $5 + mulq $2, $0, $19 + umulh $2, $0, $18 + cmplt $19, $31, $17 + cmplt $18, $31, $22 + addq $19, $19, $19 + addq $18, $18, $18 + addq $18, $17, $18 + addq $5, $22, $5 + addq $6, $19, $6 + addq $4, $18, $4 + cmpult $6, $19, $23 + cmpult $4, $18, $7 + addq $4, $23, $4 + addq $5, $7, $5 + stq $6, 16($16) + bis $31, $31, $6 + mulq $3, $0, $8 + umulh $3, $0, $24 + cmplt $8, $31, $25 + cmplt $24, $31, $27 + addq $8, $8, $8 + addq $24, $24, $24 + addq $24, $25, $24 + addq $6, $27, $6 + addq $4, $8, $4 + addq $5, $24, $5 + cmpult $4, $8, $28 + cmpult $5, $24, $21 + addq $5, $28, $5 + addq $6, $21, $6 + mulq $2, $1, $20 + umulh 
$2, $1, $17 + cmplt $20, $31, $22 + cmplt $17, $31, $19 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $22, $17 + addq $6, $19, $6 + addq $4, $20, $4 + addq $5, $17, $5 + cmpult $4, $20, $18 + cmpult $5, $17, $23 + addq $5, $18, $5 + addq $6, $23, $6 + stq $4, 24($16) + bis $31, $31, $4 + mulq $2, $2, $7 + umulh $2, $2, $25 + addq $5, $7, $5 + addq $6, $25, $6 + cmpult $5, $7, $27 + cmpult $6, $25, $8 + addq $6, $27, $6 + addq $4, $8, $4 + mulq $3, $1, $24 + umulh $3, $1, $28 + cmplt $24, $31, $21 + cmplt $28, $31, $22 + addq $24, $24, $24 + addq $28, $28, $28 + addq $28, $21, $28 + addq $4, $22, $4 + addq $5, $24, $5 + addq $6, $28, $6 + cmpult $5, $24, $19 + cmpult $6, $28, $20 + addq $6, $19, $6 + addq $4, $20, $4 + stq $5, 32($16) + bis $31, $31, $5 + mulq $3, $2, $17 + umulh $3, $2, $18 + cmplt $17, $31, $23 + cmplt $18, $31, $7 + addq $17, $17, $17 + addq $18, $18, $18 + addq $18, $23, $18 + addq $5, $7, $5 + addq $6, $17, $6 + addq $4, $18, $4 + cmpult $6, $17, $25 + cmpult $4, $18, $27 + addq $4, $25, $4 + addq $5, $27, $5 + stq $6, 40($16) + bis $31, $31, $6 + mulq $3, $3, $8 + umulh $3, $3, $21 + addq $4, $8, $4 + addq $5, $21, $5 + cmpult $4, $8, $22 + cmpult $5, $21, $24 + addq $5, $22, $5 + addq $6, $24, $6 + stq $4, 48($16) + stq $5, 56($16) + ret $31,($26),1 + .end bn_sqr_comba4 + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, $24 + cmplt $28, $31, $25 + cmplt $24, $31, $21 + addq $28, $28, $28 + addq $24, $24, $24 + addq $24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, 
$24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, $21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 + cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt $18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 + cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, $22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq $8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, $20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, $23 + addq $8, $17, $8 + 
stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, $31, $23 + mulq $5, $4, $28 + umulh $5, $4, $19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, $22 + addq $23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, $19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq $22, $21, $22 + mulq $7, $4, $28 + umulh $7, $4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + 
cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/alpha.s.works b/crypto/bn/asm/alpha.s.works new file mode 100644 index 0000000000..ee6c587809 --- /dev/null +++ b/crypto/bn/asm/alpha.s.works @@ -0,0 +1,533 @@ + + # DEC Alpha assember + # The bn_div64 is actually gcc output but the other parts are hand done. + # Thanks to tzeruch@ceddec.com for sending me the gcc output for + # bn_div64. + # I've gone back and re-done most of routines. + # The key thing to remeber for the 164 CPU is that while a + # multiply operation takes 8 cycles, another one can only be issued + # after 4 cycles have elapsed. I've done modification to help + # improve this. Also, normally, a ld instruction will not be available + # for about 3 cycles. + .file 1 "bn_asm.c" + .set noat +gcc2_compiled.: +__gnu_compiled_c: + .text + .align 3 + .globl bn_mul_add_words + .ent bn_mul_add_words +bn_mul_add_words: +bn_mul_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 + .align 3 +$42: + mulq $20,$19,$5 # 1 2 1 ###### + ldq $21,8($17) # 2 1 + ldq $2,8($16) # 2 1 + umulh $20,$19,$20 # 1 2 ###### + ldq $27,16($17) # 3 1 + ldq $3,16($16) # 3 1 + mulq $21,$19,$6 # 2 2 1 ###### + ldq $28,24($17) # 4 1 + addq $1,$5,$1 # 1 2 2 + ldq $4,24($16) # 4 1 + umulh $21,$19,$21 # 2 2 ###### + cmpult $1,$5,$22 # 1 2 3 1 + addq $20,$22,$20 # 1 3 1 + addq $1,$0,$1 # 1 2 3 1 + mulq $27,$19,$7 # 3 2 1 ###### + cmpult $1,$0,$0 # 1 2 3 2 + addq $2,$6,$2 # 2 2 2 + addq $20,$0,$0 # 1 3 2 + cmpult $2,$6,$23 # 2 2 3 1 + addq $21,$23,$21 # 2 3 1 + umulh $27,$19,$27 # 3 2 ###### + addq $2,$0,$2 # 2 2 3 1 + cmpult $2,$0,$0 # 2 2 3 2 + subq $18,4,$18 + mulq $28,$19,$8 # 4 2 1 ###### + addq $21,$0,$0 # 2 3 2 + addq $3,$7,$3 # 3 2 2 + addq $16,32,$16 + cmpult $3,$7,$24 # 3 2 3 1 + stq $1,-32($16) # 1 2 4 + umulh $28,$19,$28 # 4 2 ###### + addq $27,$24,$27 # 3 3 1 + addq $3,$0,$3 # 3 2 3 1 + stq $2,-24($16) # 2 2 4 + cmpult $3,$0,$0 # 3 2 3 2 + stq $3,-16($16) # 3 2 4 + addq $4,$8,$4 # 4 2 2 + addq $27,$0,$0 # 3 3 2 + cmpult $4,$8,$25 # 4 2 3 1 + addq $17,32,$17 + addq $28,$25,$28 # 4 3 1 + addq $4,$0,$4 # 4 2 3 1 + cmpult $4,$0,$0 # 4 2 3 2 + stq $4,-8($16) # 4 2 4 + addq $28,$0,$0 # 4 3 2 + blt $18,$43 + + ldq $20,0($17) # 1 1 + ldq $1,0($16) # 1 1 + + br $42 + + .align 4 +$45: + ldq $20,0($17) # 4 1 + ldq $1,0($16) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + addq $16,8,$16 + addq $17,8,$17 + umulh $20,$19,$20 # 4 2 + addq $1,$5,$1 # 4 2 2 + cmpult $1,$5,$22 # 4 2 3 1 + addq 
$20,$22,$20 # 4 3 1 + addq $1,$0,$1 # 4 2 3 1 + cmpult $1,$0,$0 # 4 2 3 2 + addq $20,$0,$0 # 4 3 2 + stq $1,-8($16) # 4 2 4 + bgt $18,$45 + ret $31,($26),1 # else exit + + .align 4 +$43: + addq $18,4,$18 + bgt $18,$45 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_add_words + .align 3 + .globl bn_mul_words + .ent bn_mul_words +bn_mul_words: +bn_mul_words..ng: + .frame $30,0,$26,0 + .prologue 0 + .align 5 + subq $18,4,$18 + bis $31,$31,$0 + blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$142: + + mulq $20,$19,$5 # 1 2 1 ##### + ldq $21,8($17) # 2 1 + ldq $27,16($17) # 3 1 + umulh $20,$19,$20 # 1 2 ##### + ldq $28,24($17) # 4 1 + mulq $21,$19,$6 # 2 2 1 ##### + addq $5,$0,$5 # 1 2 3 1 + subq $18,4,$18 + cmpult $5,$0,$0 # 1 2 3 2 + umulh $21,$19,$21 # 2 2 ##### + addq $20,$0,$0 # 1 3 2 + addq $17,32,$17 + addq $6,$0,$6 # 2 2 3 1 + mulq $27,$19,$7 # 3 2 1 ##### + cmpult $6,$0,$0 # 2 2 3 2 + addq $21,$0,$0 # 2 3 2 + addq $16,32,$16 + umulh $27,$19,$27 # 3 2 ##### + stq $5,-32($16) # 1 2 4 + mulq $28,$19,$8 # 4 2 1 ##### + addq $7,$0,$7 # 3 2 3 1 + stq $6,-24($16) # 2 2 4 + cmpult $7,$0,$0 # 3 2 3 2 + umulh $28,$19,$28 # 4 2 ##### + addq $27,$0,$0 # 3 3 2 + stq $7,-16($16) # 3 2 4 + addq $8,$0,$8 # 4 2 3 1 + cmpult $8,$0,$0 # 4 2 3 2 + + addq $28,$0,$0 # 4 3 2 + + stq $8,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $20,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $20,0($17) # 4 1 + mulq $20,$19,$5 # 4 2 1 + subq $18,1,$18 + umulh $20,$19,$20 # 4 2 + addq $5,$0,$5 # 4 2 3 1 + addq $16,8,$16 + cmpult $5,$0,$0 # 4 2 3 2 + addq $17,8,$17 + addq $20,$0,$0 # 4 3 2 + stq $5,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_words + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18,4,$18 + blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code + ldq $20,0($17) # 1 1 + .align 3 +$542: + mulq $20,$20,$5 ###### + ldq $21,8($17) # 1 1 + subq $18,4 + umulh $20,$20,$1 ###### + ldq $27,16($17) # 1 1 + mulq $21,$21,$6 ###### + ldq $28,24($17) # 1 1 + stq $5,0($16) # r[0] + umulh $21,$21,$2 ###### + stq $1,8($16) # r[1] + mulq $27,$27,$7 ###### + stq $6,16($16) # r[0] + umulh $27,$27,$3 ###### + stq $2,24($16) # r[1] + mulq $28,$28,$8 ###### + stq $7,32($16) # r[0] + umulh $28,$28,$4 ###### + stq $3,40($16) # r[1] + + addq $16,64,$16 + addq $17,32,$17 + stq $8,-16($16) # r[0] + stq $4,-8($16) # r[1] + + blt $18,$543 + ldq $20,0($17) # 1 1 + br $542 + +$442: + ldq $20,0($17) # a[0] + mulq $20,$20,$5 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $20,$20,$1 # a[0]*w high part r3 + stq $5,-16($16) # r[0] + stq $1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit + + .align 4 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit + .end bn_sqr_words + + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19,4,$19 + bis $31,$31,$0 # carry = 0 + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + .align 3 +$901: + addq $1,$5,$1 # r=a+b; + ldq $6,8($17) # a[1] + cmpult $1,$5,$22 # did we overflow? + ldq $2,8($18) # b[1] + addq $1,$0,$1 # c+= overflow + ldq $7,16($17) # a[2] + cmpult $1,$0,$0 # overflow? 
+ ldq $3,16($18) # b[2] + addq $0,$22,$0 + ldq $8,24($17) # a[3] + addq $2,$6,$2 # r=a+b; + ldq $4,24($18) # b[3] + cmpult $2,$6,$23 # did we overflow? + addq $3,$7,$3 # r=a+b; + addq $2,$0,$2 # c+= overflow + cmpult $3,$7,$24 # did we overflow? + cmpult $2,$0,$0 # overflow? + addq $4,$8,$4 # r=a+b; + addq $0,$23,$0 + cmpult $4,$8,$25 # did we overflow? + addq $3,$0,$3 # c+= overflow + stq $1,0($16) # r[0]=c + cmpult $3,$0,$0 # overflow? + stq $2,8($16) # r[1]=c + addq $0,$24,$0 + stq $3,16($16) # r[2]=c + addq $4,$0,$4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $4,$0,$0 # overflow? + addq $17,32,$17 # a++ + addq $0,$25,$0 + stq $4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ + + blt $19,$900 + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $5,0($17) # a[0] + ldq $1,0($18) # b[1] + addq $1,$5,$1 # r=a+b; + subq $19,1,$19 # loop-- + addq $1,$0,$1 # c+= overflow + addq $17,8,$17 # a++ + cmpult $1,$5,$22 # did we overflow? + cmpult $1,$0,$0 # overflow? + addq $18,8,$18 # b++ + stq $1,0($16) # r[0]=c + addq $0,$22,$0 + addq $16,8,$16 # r++ + + bgt $19,$945 + ret $31,($26),1 # else exit + +$900: + addq $19,4,$19 + bgt $19,$945 # goto tail code + ret $31,($26),1 # else exit + .end bn_add_words + + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$119 + lda $0,-1 + br $31,$136 + .align 4 +$119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$126 + zapnot $7,15,$27 + br $31,$127 + .align 4 +$126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$127: + srl $10,32,$4 + .align 5 +$128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$129 + subq $27,1,$27 + br $31,$128 + .align 4 +$129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$134 + addq $9,$11,$9 + subq $27,1,$27 +$134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$123 + .align 4 +$124: + bis $13,$27,$0 +$136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 + + .set noat + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $100 + 
ldq $1, 0($17) + ldq $2, 0($18) +$101: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $100 + ldq $1, 0($17) + ldq $2, 0($18) + br $101 +$102: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $102 + ret $31,($26),1 +$100: + addq $19, 4, $19 + bgt $19, $102 +$103: + ret $31,($26),1 + .end bn_sub_words diff --git a/crypto/bn/asm/alpha.works/add.pl b/crypto/bn/asm/alpha.works/add.pl new file mode 100644 index 0000000000..4dc76e6b69 --- /dev/null +++ b/crypto/bn/asm/alpha.works/add.pl @@ -0,0 +1,119 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_add_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + ($t0,$o0)=&NR(2); + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); # will we borrow? + &add($o0,$cc,$o0); # will we borrow? + &cmpult($o0,$cc,$cc); # will we borrow? 
+ &add($cc,$t0,$cc); # add the borrows + &st($o0,&QWPw(0,$rp)); # save + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($o0,$t0,$a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/div.pl b/crypto/bn/asm/alpha.works/div.pl new file mode 100644 index 0000000000..7ec144377f --- /dev/null +++ b/crypto/bn/asm/alpha.works/div.pl @@ -0,0 +1,144 @@ +#!/usr/local/bin/perl + +sub bn_div64 + { + local($data)=<<'EOF'; + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .set noreorder + .set volatile + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 +EOF + &asm_add($data); + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul.pl b/crypto/bn/asm/alpha.works/mul.pl new file mode 100644 index 0000000000..b182bae452 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul.pl @@ -0,0 +1,116 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + 
&ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$word,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($l0,$cc,$l0); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &cmpult($l0,$cc,$cc); + &st($l0,&QWPw(-1,$rp)); &FR($l0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_add.pl b/crypto/bn/asm/alpha.works/mul_add.pl new file mode 100644 index 0000000000..e37f6315fb --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_add.pl @@ -0,0 +1,120 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_add_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); 
&FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b + &mul($a0,$word,($l0)=&NR(1)); + &sub($count,1,$count); + &add($ap,$QWS,$ap); + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + &add($r0,$l0,$r0); + &add($rp,$QWS,$rp); + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &st($r0,&QWPw(-1,$rp)); &FR($r0); + &add($h0,$cc,$cc); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c4.pl b/crypto/bn/asm/alpha.works/mul_c4.pl new file mode 100644 index 0000000000..5efd201281 --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.pl @@ -0,0 +1,213 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &mul($a[0],$b[0],($r00)=&NR(1)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &muh($a[0],$b[0],($r01)=&NR(1)); + &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &mul($a[0],$b[1],($r02)=&NR(1)); + + ($R,$H1,$H2)=&NR(3); + + &st($r00,&QWPw(0,$rp)); &FR($r00); + + &mov("zero",$R); + &mul($a[1],$b[0],($r03)=&NR(1)); + + &mov("zero",$H1); + &mov("zero",$H0); + &add($R,$r01,$R); + &muh($a[0],$b[1],($r04)=&NR(1)); + &cmpult($R,$r01,($t01)=&NR(1)); &FR($r01); + &add($R,$r02,$R); + &add($H1,$t01,$H1) &FR($t01); + &muh($a[1],$b[0],($r05)=&NR(1)); + &cmpult($R,$r02,($t02)=&NR(1)); &FR($r02); + &add($R,$r03,$R); + &add($H2,$t02,$H2) &FR($t02); + &mul($a[0],$b[2],($r06)=&NR(1)); + &cmpult($R,$r03,($t03)=&NR(1)); &FR($r03); + &add($H1,$t03,$H1) &FR($t03); + &st($R,&QWPw(1,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r04,$R); + &mov("zero",$H2); + &mul($a[1],$b[1],($r07)=&NR(1)); + &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04); + &add($R,$r05,$R); + &add($H1,$t04,$H1) &FR($t04); + &mul($a[2],$b[0],($r08)=&NR(1)); + &cmpult($R,$r05,($t05)=&NR(1)); &FR($r05); + &add($R,$r01,$R); + &add($H2,$t05,$H2) &FR($t05); + 
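The per-word step in the mul-add tail above has no access to a carry flag, so each 64-bit addition recovers its carry with an unsigned compare (cmpult) against one of the addends, and both carries are folded into the high half of the product. One step, as a hedged C sketch (name mul_add_step is hypothetical; BN_ULONG assumed 64-bit):

    typedef unsigned long BN_ULONG;

    BN_ULONG mul_add_step(BN_ULONG *rp, BN_ULONG a, BN_ULONG w, BN_ULONG c)
    {
        unsigned __int128 t = (unsigned __int128)a * w;  /* mulq + umulh */
        BN_ULONG l = (BN_ULONG)t;
        BN_ULONG h = (BN_ULONG)(t >> 64);
        BN_ULONG r = *rp + l;
        h += (r < l);        /* cmpult: carry from r[i] += low half   */
        r += c;
        h += (r < c);        /* cmpult: carry from adding the old c   */
        *rp = r;
        return h;            /* becomes c for the next word           */
    }

The folding is safe because *rp + a*w + c always fits in 128 bits, so the incremented h never wraps.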
&muh($a[0],$b[2],($r09)=&NR(1)); + &cmpult($R,$r06,($t06)=&NR(1)); &FR($r06); + &add($R,$r07,$R); + &add($H1,$t06,$H1) &FR($t06); + &muh($a[1],$b[1],($r10)=&NR(1)); + &cmpult($R,$r07,($t07)=&NR(1)); &FR($r07); + &add($R,$r08,$R); + &add($H2,$t07,$H2) &FR($t07); + &muh($a[2],$b[0],($r11)=&NR(1)); + &cmpult($R,$r08,($t08)=&NR(1)); &FR($r08); + &add($H1,$t08,$H1) &FR($t08); + &st($R,&QWPw(2,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r09,$R); + &mov("zero",$H2); + &mul($a[0],$b[3],($r12)=&NR(1)); + &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09); + &add($R,$r10,$R); + &add($H1,$t09,$H1) &FR($t09); + &mul($a[1],$b[2],($r13)=&NR(1)); + &cmpult($R,$r10,($t10)=&NR(1)); &FR($r10); + &add($R,$r11,$R); + &add($H1,$t10,$H1) &FR($t10); + &mul($a[2],$b[1],($r14)=&NR(1)); + &cmpult($R,$r11,($t11)=&NR(1)); &FR($r11); + &add($R,$r12,$R); + &add($H1,$t11,$H1) &FR($t11); + &mul($a[3],$b[0],($r15)=&NR(1)); + &cmpult($R,$r12,($t12)=&NR(1)); &FR($r12); + &add($R,$r13,$R); + &add($H1,$t12,$H1) &FR($t12); + &muh($a[0],$b[3],($r16)=&NR(1)); + &cmpult($R,$r13,($t13)=&NR(1)); &FR($r13); + &add($R,$r14,$R); + &add($H1,$t13,$H1) &FR($t13); + &muh($a[1],$b[2],($r17)=&NR(1)); + &cmpult($R,$r14,($t14)=&NR(1)); &FR($r14); + &add($R,$r15,$R); + &add($H1,$t14,$H1) &FR($t14); + &muh($a[2],$b[1],($r18)=&NR(1)); + &cmpult($R,$r15,($t15)=&NR(1)); &FR($r15); + &add($H1,$t15,$H1) &FR($t15); + &st($R,&QWPw(3,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r16,$R); + &mov("zero",$H2); + &muh($a[3],$b[0],($r19)=&NR(1)); + &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16); + &add($R,$r17,$R); + &add($H1,$t16,$H1) &FR($t16); + &mul($a[1],$b[3],($r20)=&NR(1)); + &cmpult($R,$r17,($t17)=&NR(1)); &FR($r17); + &add($R,$r18,$R); + &add($H1,$t17,$H1) &FR($t17); + &mul($a[2],$b[2],($r21)=&NR(1)); + &cmpult($R,$r18,($t18)=&NR(1)); &FR($r18); + &add($R,$r19,$R); + &add($H1,$t18,$H1) &FR($t18); + &mul($a[3],$b[1],($r22)=&NR(1)); + &cmpult($R,$r19,($t19)=&NR(1)); &FR($r19); + &add($R,$r20,$R); + &add($H1,$t19,$H1) &FR($t19); + &muh($a[1],$b[3],($r23)=&NR(1)); + &cmpult($R,$r20,($t20)=&NR(1)); &FR($r20); + &add($R,$r21,$R); + &add($H1,$t20,$H1) &FR($t20); + &muh($a[2],$b[2],($r24)=&NR(1)); + &cmpult($R,$r21,($t21)=&NR(1)); &FR($r21); + &add($R,$r22,$R); + &add($H1,$t21,$H1) &FR($t21); + &muh($a[3],$b[1],($r25)=&NR(1)); + &cmpult($R,$r22,($t22)=&NR(1)); &FR($r22); + &add($H1,$t22,$H1) &FR($t22); + &st($R,&QWPw(4,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r23,$R); + &mov("zero",$H2); + &mul($a[2],$b[3],($r26)=&NR(1)); + &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23); + &add($R,$r24,$R); + &add($H1,$t23,$H1) &FR($t23); + &mul($a[3],$b[2],($r27)=&NR(1)); + &cmpult($R,$r24,($t24)=&NR(1)); &FR($r24); + &add($R,$r25,$R); + &add($H1,$t24,$H1) &FR($t24); + &muh($a[2],$b[3],($r28)=&NR(1)); + &cmpult($R,$r25,($t25)=&NR(1)); &FR($r25); + &add($R,$r26,$R); + &add($H1,$t25,$H1) &FR($t25); + &muh($a[3],$b[2],($r29)=&NR(1)); + &cmpult($R,$r26,($t26)=&NR(1)); &FR($r26); + &add($R,$r27,$R); + &add($H1,$t26,$H1) &FR($t26); + &mul($a[3],$b[3],($r30)=&NR(1)); + &cmpult($R,$r27,($t27)=&NR(1)); &FR($r27); + &add($H1,$t27,$H1) &FR($t27); + &st($R,&QWPw(5,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r28,$R); + &mov("zero",$H2); + &muh($a[3],$b[3],($r31)=&NR(1)); + &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28); + &add($R,$r29,$R); + &add($H1,$t28,$H1) &FR($t28); + ############ + &cmpult($R,$r29,($t29)=&NR(1)); &FR($r29); + &add($R,$r30,$R); + &add($H1,$t29,$H1) &FR($t29); + ############ + &cmpult($R,$r30,($t30)=&NR(1)); &FR($r30); + 
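The long inlined sequences above repeat, once per partial product, the same accumulate-into-three-words pattern that the mul_add_c helper at the top of the file encapsulates: add the low half into the bottom word, fold its carry into the high half, add that into the middle word, and push that carry into the top word. In C (hypothetical name; unsigned __int128 assumed):

    typedef unsigned long BN_ULONG;

    void mul_add_c_ref(BN_ULONG a, BN_ULONG b,
                       BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
    {
        unsigned __int128 t = (unsigned __int128)a * b;  /* mulq + umulh */
        BN_ULONG l = (BN_ULONG)t;
        BN_ULONG h = (BN_ULONG)(t >> 64);
        *c0 += l;
        h   += (*c0 < l);    /* cannot wrap: h is at most 2^64 - 2 */
        *c1 += h;
        *c2 += (*c1 < h);
    }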
&add($H1,$t30,$H1) &FR($t30); + &st($R,&QWPw(6,$rp)); + &add($H1,$H2,$R); + + &add($R,$r31,$R); &FR($r31); + &st($R,&QWPw(7,$rp)); + + &FR($R,$H1,$H2); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c4.works.pl b/crypto/bn/asm/alpha.works/mul_c4.works.pl new file mode 100644 index 0000000000..79d86dd25c --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c4.works.pl @@ -0,0 +1,98 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + +print STDERR "count=$cnt\n"; $cnt++; + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &FR($c0,$c1,$c2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/mul_c8.pl b/crypto/bn/asm/alpha.works/mul_c8.pl new file mode 100644 index 0000000000..525ca7494b --- /dev/null +++ b/crypto/bn/asm/alpha.works/mul_c8.pl @@ -0,0 +1,177 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &stack_push(2); + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &st($reg_s0,&swtmp(0)); &FR($reg_s0); + &st($reg_s1,&swtmp(1)); &FR($reg_s1); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + 
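The .works comba4 above makes the overall structure easiest to see: each result word k is the low word of the column sum of all a[i]*b[j] with i+j = k, kept in a three-word accumulator, and after every store the accumulator roles rotate — the ($c0,$c1,$c2)=($c1,$c2,$c0) shuffle. A generic C restatement of that structure (loop form rather than the fully unrolled form the generator emits; unsigned __int128 assumed):

    typedef unsigned long BN_ULONG;

    void comba4_ref(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
    {
        BN_ULONG c0 = 0, c1 = 0, c2 = 0;
        for (int k = 0; k < 7; k++) {
            int lo = (k < 4) ? 0 : k - 3;
            int hi = (k < 4) ? k : 3;
            for (int i = lo; i <= hi; i++) {          /* column i+j == k */
                unsigned __int128 t = (unsigned __int128)a[i] * b[k - i];
                BN_ULONG l = (BN_ULONG)t, h = (BN_ULONG)(t >> 64);
                c0 += l;  h  += (c0 < l);
                c1 += h;  c2 += (c1 < h);
            }
            r[k] = c0;                 /* store the finished column ...  */
            c0 = c1; c1 = c2; c2 = 0;  /* ... and rotate the accumulator */
        }
        r[7] = c0;
    }

Three words suffice because even the widest column (four 128-bit products plus carries) stays far below 2^192.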
&ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &ld(($a[4])=&NR(1),&QWPw(1,$ap)); + &ld(($b[4])=&NR(1),&QWPw(1,$bp)); + &ld(($a[5])=&NR(1),&QWPw(1,$ap)); + &ld(($b[5])=&NR(1),&QWPw(1,$bp)); + &ld(($a[6])=&NR(1),&QWPw(1,$ap)); + &ld(($b[6])=&NR(1),&QWPw(1,$bp)); + &ld(($a[7])=&NR(1),&QWPw(1,$ap)); &FR($ap); + &ld(($b[7])=&NR(1),&QWPw(1,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[4],$c0,$c1,$c2); + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); + &mul_add_c($a[4],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[5],$c0,$c1,$c2); + &mul_add_c($a[1],$b[4],$c0,$c1,$c2); + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); + &mul_add_c($a[4],$b[1],$c0,$c1,$c2); + &mul_add_c($a[5],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[6],$c0,$c1,$c2); + &mul_add_c($a[1],$b[5],$c0,$c1,$c2); + &mul_add_c($a[2],$b[4],$c0,$c1,$c2); + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); + &mul_add_c($a[4],$b[2],$c0,$c1,$c2); + &mul_add_c($a[5],$b[1],$c0,$c1,$c2); + &mul_add_c($a[6],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[6],$c0,$c1,$c2); + &mul_add_c($a[2],$b[5],$c0,$c1,$c2); + &mul_add_c($a[3],$b[4],$c0,$c1,$c2); + &mul_add_c($a[4],$b[3],$c0,$c1,$c2); + &mul_add_c($a[5],$b[2],$c0,$c1,$c2); + &mul_add_c($a[6],$b[1],$c0,$c1,$c2); + &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[6],$c0,$c1,$c2); + &mul_add_c($a[3],$b[5],$c0,$c1,$c2); + &mul_add_c($a[4],$b[4],$c0,$c1,$c2); + &mul_add_c($a[5],$b[3],$c0,$c1,$c2); + &mul_add_c($a[6],$b[2],$c0,$c1,$c2); + &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[6],$c0,$c1,$c2); + &mul_add_c($a[4],$b[5],$c0,$c1,$c2); + &mul_add_c($a[5],$b[4],$c0,$c1,$c2); + &mul_add_c($a[6],$b[3],$c0,$c1,$c2); + &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + 
&mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]); + &mul_add_c($a[4],$b[6],$c0,$c1,$c2); + &mul_add_c($a[5],$b[5],$c0,$c1,$c2); + &mul_add_c($a[6],$b[4],$c0,$c1,$c2); + &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]); + &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]); + &mul_add_c($a[5],$b[6],$c0,$c1,$c2); + &mul_add_c($a[6],$b[5],$c0,$c1,$c2); + &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]); + &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]); + &mul_add_c($a[6],$b[6],$c0,$c1,$c2); + &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]); + &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]); + &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]); + &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &FR($c0,$c1,$c2); + + &ld($reg_s0,&swtmp(0)); + &ld($reg_s1,&swtmp(1)); + &stack_pop(2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr.pl b/crypto/bn/asm/alpha.works/sqr.pl new file mode 100644 index 0000000000..a55b696906 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sqr.pl @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(3); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$a0,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &add($rp,2*$QWS,$rp); + 
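The squaring tail loop here advances rp by two words for every word of ap, because each input word yields both halves of its square. As a C reference (hypothetical name; unsigned __int128 assumed):

    typedef unsigned long BN_ULONG;

    void sqr_words_ref(BN_ULONG *r, const BN_ULONG *a, int num)
    {
        for (int i = 0; i < num; i++) {
            unsigned __int128 t = (unsigned __int128)a[i] * a[i];
            r[2 * i]     = (BN_ULONG)t;          /* mulq  a,a */
            r[2 * i + 1] = (BN_ULONG)(t >> 64);  /* umulh a,a */
        }
    }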
&sub($count,1,$count); + &muh($a0,$a0,($h0)=&NR(1)); &FR($a0); + &st($l0,&QWPw(-2,$rp)); &FR($l0); + &st($h0,&QWPw(-1,$rp)); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr_c4.pl b/crypto/bn/asm/alpha.works/sqr_c4.pl new file mode 100644 index 0000000000..bf33f5b503 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sqr_c4.pl @@ -0,0 +1,109 @@ +#!/usr/local/bin/perl +# alpha assember + +sub sqr_add_c + { + local($a,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$a,($l1)=&NR(1)); + &muh($a,$a,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c1,$t1,$c1); &FR($t1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub sqr_add_c2 + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &cmplt($l1,"zero",($lc1)=&NR(1)); + &cmplt($h1,"zero",($hc1)=&NR(1)); + &add($l1,$l1,$l1); + &add($h1,$h1,$h1); + &add($h1,$lc1,$h1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1); + + &add($c1,$lc1,$c1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + } + + +sub bn_sqr_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sqr_c8.pl b/crypto/bn/asm/alpha.works/sqr_c8.pl new file mode 100644 index 0000000000..b4afe085f1 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sqr_c8.pl @@ -0,0 +1,132 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($a[4])=&NR(1),&QWPw(4,$ap)); + &ld(($a[5])=&NR(1),&QWPw(5,$ap)); + 
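sqr_add_c2 above adds 2*a*b into the column accumulator. Doubling a 128-bit product with only 64-bit adds is the subtle part: the top bit of each half is captured first with a signed compare against zero (cmplt reads the sign bit), then each half is added to itself, the bit shifted out of the low half re-enters the high half, and the bit shifted out of the high half goes straight into the third accumulator word. A carry-safe C rendering of the same idea (hypothetical name; the exact carry ordering here is chosen so no intermediate wrap is lost, which is slightly more defensive than the scheduled assembly):

    typedef unsigned long BN_ULONG;

    void sqr_add_c2_ref(BN_ULONG a, BN_ULONG b,
                        BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        BN_ULONG l  = (BN_ULONG)t;
        BN_ULONG h  = (BN_ULONG)(t >> 64);
        BN_ULONG lc = l >> 63;        /* cmplt l,zero: bit lost by l+l */
        BN_ULONG hc = h >> 63;        /* cmplt h,zero: bit lost by h+h */
        BN_ULONG t0, cy;

        l = l + l;                    /* double the product;           */
        h = h + h + lc;               /* 2h is even, so +lc cannot wrap */

        *c0 += l;  t0  = (*c0 < l);
        *c1 += h;  cy  = (*c1 < h);
        *c1 += t0; cy += (*c1 < t0);  /* catch a wrap from the +t0 too */
        *c2 += cy + hc;
    }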
&ld(($a[6])=&NR(1),&QWPw(6,$ap)); + &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(7,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[4],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(8,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(9,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[5],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2); + &st($c0,&QWPw(10,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2); + &st($c0,&QWPw(11,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[6],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2); + &st($c0,&QWPw(12,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2); + &st($c0,&QWPw(13,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[7],$c0,$c1,$c2); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha.works/sub.pl b/crypto/bn/asm/alpha.works/sub.pl new file mode 100644 index 0000000000..d998da5c21 --- /dev/null +++ b/crypto/bn/asm/alpha.works/sub.pl @@ -0,0 +1,108 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8); + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? 
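Both squaring combas use the same symmetry: in column k, a diagonal term a[i]^2 (when k = 2i) enters once via sqr_add_c, while every cross term a[i]*a[j] with i > j, i+j = k enters doubled via sqr_add_c2, so only just over half the products of a full multiply are computed. A generic C restatement for n words (loop form; the doubling is expressed here by accumulating the cross product twice, which is equivalent to sqr_add_c2):

    typedef unsigned long BN_ULONG;

    void sqr_comba_ref(BN_ULONG *r, const BN_ULONG *a, int n)
    {
        BN_ULONG c0 = 0, c1 = 0, c2 = 0;
        for (int k = 0; k < 2 * n - 1; k++) {
            for (int i = (k < n) ? k : n - 1; 2 * i >= k; i--) {
                int j = k - i;
                int times = (i == j) ? 1 : 2;   /* cross terms count twice */
                while (times--) {
                    unsigned __int128 t = (unsigned __int128)a[i] * a[j];
                    BN_ULONG l = (BN_ULONG)t, h = (BN_ULONG)(t >> 64);
                    c0 += l;  h  += (c0 < l);
                    c1 += h;  c2 += (c1 < h);
                }
            }
            r[k] = c0; c0 = c1; c1 = c2; c2 = 0;
        }
        r[2 * n - 1] = c0;
    }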
+ &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$o0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp); + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$o1); # will we borrow? + &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1); + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($o0,&QWPw(0,$rp)); &FR($o0); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$o2); # will we borrow? + &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2); + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($o1,&QWPw(1,$rp)); &FR($o1); + &cmpult($a3,$cc,$b3); # will we borrow? + &sub($a3,$cc,$o3); # will we borrow? + &add($b3,$t3,$cc); &FR($t3,$a3,$b3); + + &st($o2,&QWPw(2,$rp)); &FR($o2); + &sub($count,4,$count); # count-=4 + &st($o3,&QWPw(3,$rp)); &FR($o3); + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # will we borrow? + &st($a0,&QWPw(0,$rp)); # save + &add($b0,$tmp,$cc); # add the borrows + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/add.pl b/crypto/bn/asm/alpha/add.pl new file mode 100644 index 0000000000..13bf516428 --- /dev/null +++ b/crypto/bn/asm/alpha/add.pl @@ -0,0 +1,118 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_add_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + +########################################################## + &set_label("loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + &ld(($b0)=&NR(1),&QWPw(0,$bp)); + &ld(($a1)=&NR(1),&QWPw(1,$ap)); + &ld(($b1)=&NR(1),&QWPw(1,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &ld(($a2)=&NR(1),&QWPw(2,$ap)); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &ld(($b2)=&NR(1),&QWPw(2,$bp)); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); &FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &ld(($a3)=&NR(1),&QWPw(3,$ap)); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &ld(($b3)=&NR(1),&QWPw(3,$bp)); + &st($o0,&QWPw(0,$rp)); &FR($o0); + &add($cc,$t2,$cc); 
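The subtraction loop uses two cmpult tests per word: one for the borrow out of a[i] - b[i], one for the borrow out of subtracting the incoming carry. At most one of the two can fire, so the running borrow stays 0 or 1. In C (hypothetical name; BN_ULONG assumed 64-bit):

    typedef unsigned long BN_ULONG;

    BN_ULONG sub_words_ref(BN_ULONG *r, const BN_ULONG *a,
                           const BN_ULONG *b, int num)
    {
        BN_ULONG c = 0;                      /* borrow, always 0 or 1 */
        for (int i = 0; i < num; i++) {
            BN_ULONG t  = a[i] - b[i];
            BN_ULONG b1 = (a[i] < b[i]);     /* cmpult: borrow from a-b */
            BN_ULONG b2 = (t < c);           /* cmpult: borrow from -c  */
            r[i] = t - c;
            c = b1 + b2;                     /* at most one is set      */
        }
        return c;
    }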
&FR($t2); + + ($t3,$o3)=&NR(2); + + &st($o1,&QWPw(0,$rp)); &FR($o1); + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &cmpult($o3,$cc,$cc); + &st($o3,&QWPw(0,$rp)); &FR($o3); + &add($cc,$t3,$cc); &FR($t3); + + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + ### + &bge($count,&label("loop")); + ### + &br(&label("finish")); +################################################## + # Do the last 0..3 words + + ($t0,$o0)=&NR(2); + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($a0,$b0,$o0); + &sub($count,1,$count); + &cmpult($o0,$b0,$t0); # will we borrow? + &add($o0,$cc,$o0); # will we borrow? + &cmpult($o0,$cc,$cc); # will we borrow? + &add($rp,$QWS,$rp); + &st($o0,&QWPw(-1,$rp)); # save + &add($cc,$t0,$cc); # add the borrows + + ### + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &FR($o0,$t0,$a0,$b0); + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/div.pl b/crypto/bn/asm/alpha/div.pl new file mode 100644 index 0000000000..e9e680897a --- /dev/null +++ b/crypto/bn/asm/alpha/div.pl @@ -0,0 +1,144 @@ +#!/usr/local/bin/perl + +sub bn_div_words + { + local($data)=<<'EOF'; + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the lables. + # +.text + .set noreorder + .set volatile + .align 3 + .globl bn_div_words + .ent bn_div_words +bn_div_words + ldgp $29,0($27) +bn_div_words.ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 
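Every routine in this directory shares the control skeleton visible in the finish/last_loop code above: count is biased by -4 up front so the four-way unrolled body can simply test for "still non-negative", and the leftover 0..3 words are handled one at a time after the bias is added back. A restatement with a deliberately trivial word-copy body (hypothetical example, not one of the BN routines):

    void copy_unroll4(unsigned long *r, const unsigned long *a, int num)
    {
        int count = num - 4;            /* sub count,4                   */
        while (count >= 0) {            /* blt -> "finish" when negative */
            r[0] = a[0]; r[1] = a[1];   /* body unrolled by four         */
            r[2] = a[2]; r[3] = a[3];
            r += 4; a += 4;
            count -= 4;
        }
        count += 4;                     /* "finish": undo the bias       */
        while (count-- > 0)             /* "last_loop": 0..3 stragglers  */
            *r++ = *a++;
    }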
+$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div_words +EOF + &asm_add($data); + } + +1; diff --git a/crypto/bn/asm/alpha/mul.pl b/crypto/bn/asm/alpha/mul.pl new file mode 100644 index 0000000000..76c926566c --- /dev/null +++ b/crypto/bn/asm/alpha/mul.pl @@ -0,0 +1,104 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + ### + &blt($count,&label("finish")); + + ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap)); + + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ### wait 8 + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + ### wait 8 + &muh($a1,$word,($h1)=&NR(1)); &FR($a1); + &add($l0,$cc,$l0); ### wait 8 + &mul($a1,$word,($l1)=&NR(1)); &FR($a1); + &cmpult($l0,$cc,$cc); ### wait 8 + &muh($a2,$word,($h2)=&NR(1)); &FR($a2); + &add($h0,$cc,$cc); &FR($h0); ### wait 8 + &mul($a2,$word,($l2)=&NR(1)); &FR($a2); + &add($l1,$cc,$l1); ### wait 8 + &st($l0,&QWPw(0,$rp)); &FR($l0); + &cmpult($l1,$cc,$cc); ### wait 8 + &muh($a3,$word,($h3)=&NR(1)); &FR($a3); + &add($h1,$cc,$cc); &FR($h1); + &mul($a3,$word,($l3)=&NR(1)); &FR($a3); + &add($l2,$cc,$l2); + &st($l1,&QWPw(1,$rp)); &FR($l1); + &cmpult($l2,$cc,$cc); + &add($h2,$cc,$cc); &FR($h2); + &sub($count,4,$count); # count-=4 + &st($l2,&QWPw(2,$rp)); &FR($l2); + &add($l3,$cc,$l3); + &cmpult($l3,$cc,$cc); + &add($bp,4*$QWS,$bp); # count+=4 + &add($h3,$cc,$cc); &FR($h3); + &add($ap,4*$QWS,$ap); # count+=4 + &st($l3,&QWPw(3,$rp)); &FR($l3); + &add($rp,4*$QWS,$rp); # count+=4 + ### + &blt($count,&label("finish")); + ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap)); + &br(&label("finish")); +################################################## + +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + ### + ### + ### + &muh($a0,$word,($h0)=&NR(1)); + ### Wait 8 for next mul issue + &mul($a0,$word,($l0)=&NR(1)); &FR($a0) + &add($ap,$QWS,$ap); + ### Loose 12 until result is available + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &add($l0,$cc,$l0); + ### + &st($l0,&QWPw(-1,$rp)); &FR($l0); + &cmpult($l0,$cc,$cc); + &add($h0,$cc,$cc); &FR($h0); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_add.pl b/crypto/bn/asm/alpha/mul_add.pl new file mode 100644 index 0000000000..0d6df69bc4 --- /dev/null +++ b/crypto/bn/asm/alpha/mul_add.pl @@ -0,0 +1,123 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_add_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + $word=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + ### + &blt($count,&label("finish")); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + +$a=<<'EOF'; +########################################################## + 
&set_label("loop"); + + &ld(($r0)=&NR(1),&QWPw(0,$rp)); + &ld(($a1)=&NR(1),&QWPw(1,$ap)); + &muh($a0,$word,($h0)=&NR(1)); + &ld(($r1)=&NR(1),&QWPw(1,$rp)); + &ld(($a2)=&NR(1),&QWPw(2,$ap)); + ### + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + &ld(($r2)=&NR(1),&QWPw(2,$rp)); + &muh($a1,$word,($h1)=&NR(1)); + &ld(($a3)=&NR(1),&QWPw(3,$ap)); + &mul($a1,$word,($l1)=&NR(1)); &FR($a1); + &ld(($r3)=&NR(1),&QWPw(3,$rp)); + &add($r0,$l0,$r0); + &add($r1,$l1,$r1); + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &cmpult($r1,$l1,($t1)=&NR(1)); &FR($l1); + &muh($a2,$word,($h2)=&NR(1)); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &add($h1,$t1,$h1); &FR($t1); + &add($h0,$cc,$cc); &FR($h0); + &mul($a2,$word,($l2)=&NR(1)); &FR($a2); + &add($r1,$cc,$r1); + &cmpult($r1,$cc,$cc); + &add($r2,$l2,$r2); + &add($h1,$cc,$cc); &FR($h1); + &cmpult($r2,$l2,($t2)=&NR(1)); &FR($l2); + &muh($a3,$word,($h3)=&NR(1)); + &add($r2,$cc,$r2); + &st($r0,&QWPw(0,$rp)); &FR($r0); + &add($h2,$t2,$h2); &FR($t2); + &st($r1,&QWPw(1,$rp)); &FR($r1); + &cmpult($r2,$cc,$cc); + &mul($a3,$word,($l3)=&NR(1)); &FR($a3); + &add($h2,$cc,$cc); &FR($h2); + &st($r2,&QWPw(2,$rp)); &FR($r2); + &sub($count,4,$count); # count-=4 + &add($rp,4*$QWS,$rp); # count+=4 + &add($r3,$l3,$r3); + &add($ap,4*$QWS,$ap); # count+=4 + &cmpult($r3,$l3,($t3)=&NR(1)); &FR($l3); + &add($r3,$cc,$r3); + &add($h3,$t3,$h3); &FR($t3); + &cmpult($r3,$cc,$cc); + &st($r3,&QWPw(-1,$rp)); &FR($r3); + &add($h3,$cc,$cc); &FR($h3); + + ### + &blt($count,&label("finish")); + &ld(($a0)=&NR(1),&QWPw(0,$ap)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &ld(($r0)=&NR(1),&QWPw(0,$rp)); # get b + ### + ### + &muh($a0,$word,($h0)=&NR(1)); &FR($a0); + ### wait 8 + &mul($a0,$word,($l0)=&NR(1)); &FR($a0); + &add($rp,$QWS,$rp); + &add($ap,$QWS,$ap); + &sub($count,1,$count); + ### wait 3 until l0 is available + &add($r0,$l0,$r0); + ### + &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0); + &add($r0,$cc,$r0); + &add($h0,$t0,$h0); &FR($t0); + &cmpult($r0,$cc,$cc); + &add($h0,$cc,$cc); &FR($h0); + + &st($r0,&QWPw(-1,$rp)); &FR($r0); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c4.pl b/crypto/bn/asm/alpha/mul_c4.pl new file mode 100644 index 0000000000..9cc876ded4 --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c4.pl @@ -0,0 +1,215 @@ +#!/usr/local/bin/perl +# alpha assember + +# upto + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &mul($a[0],$b[0],($r00)=&NR(1)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + 
&muh($a[0],$b[0],($r01)=&NR(1)); + &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &mul($a[0],$b[1],($r02)=&NR(1)); + + ($R,$H1,$H2)=&NR(3); + + &st($r00,&QWPw(0,$rp)); &FR($r00); + + &mov("zero",$R); + &mul($a[1],$b[0],($r03)=&NR(1)); + + &mov("zero",$H1); + &mov("zero",$H0); + &add($R,$r01,$R); + &muh($a[0],$b[1],($r04)=&NR(1)); + &cmpult($R,$r01,($t01)=&NR(1)); &FR($r01); + &add($R,$r02,$R); + &add($H1,$t01,$H1) &FR($t01); + &muh($a[1],$b[0],($r05)=&NR(1)); + &cmpult($R,$r02,($t02)=&NR(1)); &FR($r02); + &add($R,$r03,$R); + &add($H2,$t02,$H2) &FR($t02); + &mul($a[0],$b[2],($r06)=&NR(1)); + &cmpult($R,$r03,($t03)=&NR(1)); &FR($r03); + &add($H1,$t03,$H1) &FR($t03); + &st($R,&QWPw(1,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r04,$R); + &mov("zero",$H2); + &mul($a[1],$b[1],($r07)=&NR(1)); + &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04); + &add($R,$r05,$R); + &add($H1,$t04,$H1) &FR($t04); + &mul($a[2],$b[0],($r08)=&NR(1)); + &cmpult($R,$r05,($t05)=&NR(1)); &FR($r05); + &add($R,$r01,$R); + &add($H2,$t05,$H2) &FR($t05); + &muh($a[0],$b[2],($r09)=&NR(1)); + &cmpult($R,$r06,($t06)=&NR(1)); &FR($r06); + &add($R,$r07,$R); + &add($H1,$t06,$H1) &FR($t06); + &muh($a[1],$b[1],($r10)=&NR(1)); + &cmpult($R,$r07,($t07)=&NR(1)); &FR($r07); + &add($R,$r08,$R); + &add($H2,$t07,$H2) &FR($t07); + &muh($a[2],$b[0],($r11)=&NR(1)); + &cmpult($R,$r08,($t08)=&NR(1)); &FR($r08); + &add($H1,$t08,$H1) &FR($t08); + &st($R,&QWPw(2,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r09,$R); + &mov("zero",$H2); + &mul($a[0],$b[3],($r12)=&NR(1)); + &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09); + &add($R,$r10,$R); + &add($H1,$t09,$H1) &FR($t09); + &mul($a[1],$b[2],($r13)=&NR(1)); + &cmpult($R,$r10,($t10)=&NR(1)); &FR($r10); + &add($R,$r11,$R); + &add($H1,$t10,$H1) &FR($t10); + &mul($a[2],$b[1],($r14)=&NR(1)); + &cmpult($R,$r11,($t11)=&NR(1)); &FR($r11); + &add($R,$r12,$R); + &add($H1,$t11,$H1) &FR($t11); + &mul($a[3],$b[0],($r15)=&NR(1)); + &cmpult($R,$r12,($t12)=&NR(1)); &FR($r12); + &add($R,$r13,$R); + &add($H1,$t12,$H1) &FR($t12); + &muh($a[0],$b[3],($r16)=&NR(1)); + &cmpult($R,$r13,($t13)=&NR(1)); &FR($r13); + &add($R,$r14,$R); + &add($H1,$t13,$H1) &FR($t13); + &muh($a[1],$b[2],($r17)=&NR(1)); + &cmpult($R,$r14,($t14)=&NR(1)); &FR($r14); + &add($R,$r15,$R); + &add($H1,$t14,$H1) &FR($t14); + &muh($a[2],$b[1],($r18)=&NR(1)); + &cmpult($R,$r15,($t15)=&NR(1)); &FR($r15); + &add($H1,$t15,$H1) &FR($t15); + &st($R,&QWPw(3,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r16,$R); + &mov("zero",$H2); + &muh($a[3],$b[0],($r19)=&NR(1)); + &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16); + &add($R,$r17,$R); + &add($H1,$t16,$H1) &FR($t16); + &mul($a[1],$b[3],($r20)=&NR(1)); + &cmpult($R,$r17,($t17)=&NR(1)); &FR($r17); + &add($R,$r18,$R); + &add($H1,$t17,$H1) &FR($t17); + &mul($a[2],$b[2],($r21)=&NR(1)); + &cmpult($R,$r18,($t18)=&NR(1)); &FR($r18); + &add($R,$r19,$R); + &add($H1,$t18,$H1) &FR($t18); + &mul($a[3],$b[1],($r22)=&NR(1)); + &cmpult($R,$r19,($t19)=&NR(1)); &FR($r19); + &add($R,$r20,$R); + &add($H1,$t19,$H1) &FR($t19); + &muh($a[1],$b[3],($r23)=&NR(1)); + &cmpult($R,$r20,($t20)=&NR(1)); &FR($r20); + &add($R,$r21,$R); + &add($H1,$t20,$H1) &FR($t20); + &muh($a[2],$b[2],($r24)=&NR(1)); + &cmpult($R,$r21,($t21)=&NR(1)); &FR($r21); + &add($R,$r22,$R); + &add($H1,$t21,$H1) &FR($t21); + &muh($a[3],$b[1],($r25)=&NR(1)); + &cmpult($R,$r22,($t22)=&NR(1)); &FR($r22); + &add($H1,$t22,$H1) &FR($t22); + &st($R,&QWPw(4,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + 
&add($R,$r23,$R); + &mov("zero",$H2); + &mul($a[2],$b[3],($r26)=&NR(1)); + &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23); + &add($R,$r24,$R); + &add($H1,$t23,$H1) &FR($t23); + &mul($a[3],$b[2],($r27)=&NR(1)); + &cmpult($R,$r24,($t24)=&NR(1)); &FR($r24); + &add($R,$r25,$R); + &add($H1,$t24,$H1) &FR($t24); + &muh($a[2],$b[3],($r28)=&NR(1)); + &cmpult($R,$r25,($t25)=&NR(1)); &FR($r25); + &add($R,$r26,$R); + &add($H1,$t25,$H1) &FR($t25); + &muh($a[3],$b[2],($r29)=&NR(1)); + &cmpult($R,$r26,($t26)=&NR(1)); &FR($r26); + &add($R,$r27,$R); + &add($H1,$t26,$H1) &FR($t26); + &mul($a[3],$b[3],($r30)=&NR(1)); + &cmpult($R,$r27,($t27)=&NR(1)); &FR($r27); + &add($H1,$t27,$H1) &FR($t27); + &st($R,&QWPw(5,$rp)); + &add($H1,$H2,$R); + + &mov("zero",$H1); + &add($R,$r28,$R); + &mov("zero",$H2); + &muh($a[3],$b[3],($r31)=&NR(1)); + &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28); + &add($R,$r29,$R); + &add($H1,$t28,$H1) &FR($t28); + ############ + &cmpult($R,$r29,($t29)=&NR(1)); &FR($r29); + &add($R,$r30,$R); + &add($H1,$t29,$H1) &FR($t29); + ############ + &cmpult($R,$r30,($t30)=&NR(1)); &FR($r30); + &add($H1,$t30,$H1) &FR($t30); + &st($R,&QWPw(6,$rp)); + &add($H1,$H2,$R); + + &add($R,$r31,$R); &FR($r31); + &st($R,&QWPw(7,$rp)); + + &FR($R,$H1,$H2); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c4.works.pl b/crypto/bn/asm/alpha/mul_c4.works.pl new file mode 100644 index 0000000000..79d86dd25c --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c4.works.pl @@ -0,0 +1,98 @@ +#!/usr/local/bin/perl +# alpha assember + +sub mul_add_c + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + +print STDERR "count=$cnt\n"; $cnt++; + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &add($t1,$h1,$h1); &FR($t1); + &add($c1,$h1,$c1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub bn_mul_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + 
&mov("zero",$c2); + + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &FR($c0,$c1,$c2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/mul_c8.pl b/crypto/bn/asm/alpha/mul_c8.pl new file mode 100644 index 0000000000..525ca7494b --- /dev/null +++ b/crypto/bn/asm/alpha/mul_c8.pl @@ -0,0 +1,177 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_mul_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(3); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + + &function_begin($name,""); + + &comment(""); + + &stack_push(2); + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($b[0])=&NR(1),&QWPw(0,$bp)); + &st($reg_s0,&swtmp(0)); &FR($reg_s0); + &st($reg_s1,&swtmp(1)); &FR($reg_s1); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($b[1])=&NR(1),&QWPw(1,$bp)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($b[2])=&NR(1),&QWPw(2,$bp)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($b[3])=&NR(1),&QWPw(3,$bp)); + &ld(($a[4])=&NR(1),&QWPw(1,$ap)); + &ld(($b[4])=&NR(1),&QWPw(1,$bp)); + &ld(($a[5])=&NR(1),&QWPw(1,$ap)); + &ld(($b[5])=&NR(1),&QWPw(1,$bp)); + &ld(($a[6])=&NR(1),&QWPw(1,$ap)); + &ld(($b[6])=&NR(1),&QWPw(1,$bp)); + &ld(($a[7])=&NR(1),&QWPw(1,$ap)); &FR($ap); + &ld(($b[7])=&NR(1),&QWPw(1,$bp)); &FR($bp); + + ($c0,$c1,$c2)=&NR(3); + &mov("zero",$c2); + &mul($a[0],$b[0],$c0); + &muh($a[0],$b[0],$c1); + &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[1],$c0,$c1,$c2); + &mul_add_c($a[1],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[2],$c0,$c1,$c2); + &mul_add_c($a[1],$b[1],$c0,$c1,$c2); + &mul_add_c($a[2],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[3],$c0,$c1,$c2); + &mul_add_c($a[1],$b[2],$c0,$c1,$c2); + &mul_add_c($a[2],$b[1],$c0,$c1,$c2); + &mul_add_c($a[3],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[4],$c0,$c1,$c2); + &mul_add_c($a[1],$b[3],$c0,$c1,$c2); + &mul_add_c($a[2],$b[2],$c0,$c1,$c2); + &mul_add_c($a[3],$b[1],$c0,$c1,$c2); + &mul_add_c($a[4],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[5],$c0,$c1,$c2); + &mul_add_c($a[1],$b[4],$c0,$c1,$c2); + &mul_add_c($a[2],$b[3],$c0,$c1,$c2); + &mul_add_c($a[3],$b[2],$c0,$c1,$c2); + &mul_add_c($a[4],$b[1],$c0,$c1,$c2); + &mul_add_c($a[5],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[6],$c0,$c1,$c2); + &mul_add_c($a[1],$b[5],$c0,$c1,$c2); + &mul_add_c($a[2],$b[4],$c0,$c1,$c2); + &mul_add_c($a[3],$b[3],$c0,$c1,$c2); + &mul_add_c($a[4],$b[2],$c0,$c1,$c2); + &mul_add_c($a[5],$b[1],$c0,$c1,$c2); + &mul_add_c($a[6],$b[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]); + &mul_add_c($a[1],$b[6],$c0,$c1,$c2); + &mul_add_c($a[2],$b[5],$c0,$c1,$c2); + 
&mul_add_c($a[3],$b[4],$c0,$c1,$c2); + &mul_add_c($a[4],$b[3],$c0,$c1,$c2); + &mul_add_c($a[5],$b[2],$c0,$c1,$c2); + &mul_add_c($a[6],$b[1],$c0,$c1,$c2); + &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]); + &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]); + &mul_add_c($a[2],$b[6],$c0,$c1,$c2); + &mul_add_c($a[3],$b[5],$c0,$c1,$c2); + &mul_add_c($a[4],$b[4],$c0,$c1,$c2); + &mul_add_c($a[5],$b[3],$c0,$c1,$c2); + &mul_add_c($a[6],$b[2],$c0,$c1,$c2); + &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]); + &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]); + &mul_add_c($a[3],$b[6],$c0,$c1,$c2); + &mul_add_c($a[4],$b[5],$c0,$c1,$c2); + &mul_add_c($a[5],$b[4],$c0,$c1,$c2); + &mul_add_c($a[6],$b[3],$c0,$c1,$c2); + &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]); + &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]); + &mul_add_c($a[4],$b[6],$c0,$c1,$c2); + &mul_add_c($a[5],$b[5],$c0,$c1,$c2); + &mul_add_c($a[6],$b[4],$c0,$c1,$c2); + &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]); + &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]); + &mul_add_c($a[5],$b[6],$c0,$c1,$c2); + &mul_add_c($a[6],$b[5],$c0,$c1,$c2); + &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]); + &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]); + &mul_add_c($a[6],$b[6],$c0,$c1,$c2); + &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]); + &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]); + &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]); + &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &FR($c0,$c1,$c2); + + &ld($reg_s0,&swtmp(0)); + &ld($reg_s1,&swtmp(1)); + &stack_pop(2); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sqr.pl b/crypto/bn/asm/alpha/sqr.pl new file mode 100644 index 0000000000..a55b696906 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr.pl @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_words + { + local($name)=@_; + local($cc,$a,$b,$r,$couny); + + &init_pool(3); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $count=&wparam(2); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &br(&label("finish")); + &blt($count,&label("finish")); + + ($a0,$r0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($r0,&QWPw(0,$rp)); + +$a=<<'EOF'; +########################################################## + &set_label("loop"); + + ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap)); + ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp)); + ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap)); + ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp)); + ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap)); + ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp)); + + ($o0,$t0)=&NR(2); + &add($a0,$b0,$o0); + &cmpult($o0,$b0,$t0); + &add($o0,$cc,$o0); + &cmpult($o0,$cc,$cc); + &add($cc,$t0,$cc); &FR($t0); + + ($t1,$o1)=&NR(2); + + &add($a1,$b1,$o1); &FR($a1); + &cmpult($o1,$b1,$t1); 
&FR($b1); + &add($o1,$cc,$o1); + &cmpult($o1,$cc,$cc); + &add($cc,$t1,$cc); &FR($t1); + + ($t2,$o2)=&NR(2); + + &add($a2,$b2,$o2); &FR($a2); + &cmpult($o2,$b2,$t2); &FR($b2); + &add($o2,$cc,$o2); + &cmpult($o2,$cc,$cc); + &add($cc,$t2,$cc); &FR($t2); + + ($t3,$o3)=&NR(2); + + &add($a3,$b3,$o3); &FR($a3); + &cmpult($o3,$b3,$t3); &FR($b3); + &add($o3,$cc,$o3); + &cmpult($o3,$cc,$cc); + &add($cc,$t3,$cc); &FR($t3); + + &st($o0,&QWPw(0,$rp)); &FR($o0); + &st($o1,&QWPw(0,$rp)); &FR($o1); + &st($o2,&QWPw(0,$rp)); &FR($o2); + &st($o3,&QWPw(0,$rp)); &FR($o3); + + &sub($count,4,$count); # count-=4 + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +EOF +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a + &mul($a0,$a0,($l0)=&NR(1)); + &add($ap,$QWS,$ap); + &add($rp,2*$QWS,$rp); + &sub($count,1,$count); + &muh($a0,$a0,($h0)=&NR(1)); &FR($a0); + &st($l0,&QWPw(-2,$rp)); &FR($l0); + &st($h0,&QWPw(-1,$rp)); &FR($h0); + + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sqr_c4.pl b/crypto/bn/asm/alpha/sqr_c4.pl new file mode 100644 index 0000000000..bf33f5b503 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr_c4.pl @@ -0,0 +1,109 @@ +#!/usr/local/bin/perl +# alpha assember + +sub sqr_add_c + { + local($a,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$a,($l1)=&NR(1)); + &muh($a,$a,($h1)=&NR(1)); + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1); + &add($c1,$t1,$c1); &FR($t1); + &add($c2,$t2,$c2); &FR($t2); + } + +sub sqr_add_c2 + { + local($a,$b,$c0,$c1,$c2)=@_; + local($l1,$h1,$t1,$t2); + + &mul($a,$b,($l1)=&NR(1)); + &muh($a,$b,($h1)=&NR(1)); + &cmplt($l1,"zero",($lc1)=&NR(1)); + &cmplt($h1,"zero",($hc1)=&NR(1)); + &add($l1,$l1,$l1); + &add($h1,$h1,$h1); + &add($h1,$lc1,$h1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + + &add($c0,$l1,$c0); + &add($c1,$h1,$c1); + &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1); + &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1); + + &add($c1,$lc1,$c1); &FR($lc1); + &add($c2,$hc1,$c2); &FR($hc1); + } + + +sub bn_sqr_comba4 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + 
&sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + &st($c1,&QWPw(7,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sqr_c8.pl b/crypto/bn/asm/alpha/sqr_c8.pl new file mode 100644 index 0000000000..b4afe085f1 --- /dev/null +++ b/crypto/bn/asm/alpha/sqr_c8.pl @@ -0,0 +1,132 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sqr_comba8 + { + local($name)=@_; + local(@a,@b,$r,$c0,$c1,$c2); + + $cnt=1; + &init_pool(2); + + $rp=&wparam(0); + $ap=&wparam(1); + + &function_begin($name,""); + + &comment(""); + + &ld(($a[0])=&NR(1),&QWPw(0,$ap)); + &ld(($a[1])=&NR(1),&QWPw(1,$ap)); + &ld(($a[2])=&NR(1),&QWPw(2,$ap)); + &ld(($a[3])=&NR(1),&QWPw(3,$ap)); + &ld(($a[4])=&NR(1),&QWPw(4,$ap)); + &ld(($a[5])=&NR(1),&QWPw(5,$ap)); + &ld(($a[6])=&NR(1),&QWPw(6,$ap)); + &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap); + + ($c0,$c1,$c2)=&NR(3); + + &mov("zero",$c2); + &mul($a[0],$a[0],$c0); + &muh($a[0],$a[0],$c1); + &st($c0,&QWPw(0,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(1,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[1],$c0,$c1,$c2); + &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(2,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(3,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[2],$c0,$c1,$c2); + &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(4,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(5,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[3],$c0,$c1,$c2); + &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(6,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2); + &st($c0,&QWPw(7,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[4],$c0,$c1,$c2); + &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2); + &st($c0,&QWPw(8,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2); + &st($c0,&QWPw(9,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[5],$c0,$c1,$c2); + &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2); + &st($c0,&QWPw(10,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2); + &st($c0,&QWPw(11,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[6],$c0,$c1,$c2); + &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2); + &st($c0,&QWPw(12,$rp)); + ($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2); + &st($c0,&QWPw(13,$rp)); + 
($c0,$c1,$c2)=($c1,$c2,$c0); + &mov("zero",$c2); + + &sqr_add_c($a[7],$c0,$c1,$c2); + &st($c0,&QWPw(14,$rp)); + &st($c1,&QWPw(15,$rp)); + + &function_end($name); + + &fin_pool; + } + +1; diff --git a/crypto/bn/asm/alpha/sub.pl b/crypto/bn/asm/alpha/sub.pl new file mode 100644 index 0000000000..d998da5c21 --- /dev/null +++ b/crypto/bn/asm/alpha/sub.pl @@ -0,0 +1,108 @@ +#!/usr/local/bin/perl +# alpha assember + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + &init_pool(4); + ($cc)=GR("r0"); + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + ($a0,$b0)=&NR(2); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8); + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? + &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$o0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp); + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$o1); # will we borrow? + &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1); + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($o0,&QWPw(0,$rp)); &FR($o0); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$o2); # will we borrow? + &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2); + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($o1,&QWPw(1,$rp)); &FR($o1); + &cmpult($a3,$cc,$b3); # will we borrow? + &sub($a3,$cc,$o3); # will we borrow? + &add($b3,$t3,$cc); &FR($t3,$a3,$b3); + + &st($o2,&QWPw(2,$rp)); &FR($o2); + &sub($count,4,$count); # count-=4 + &st($o3,&QWPw(3,$rp)); &FR($o3); + &add($ap,4*$QWS,$ap); # count+=4 + &add($bp,4*$QWS,$bp); # count+=4 + &add($rp,4*$QWS,$rp); # count+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # will we borrow? 
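+ # Each word of the difference is r = (a - b) - carry; as a hedged C
+ # sketch of this borrow chain (ours, not taken from bn_asm.c):
+ #   t    = a[i] - b[i];   b1 = (a[i] < b[i]);   /* first cmpult  */
+ #   r[i] = t - c;         b2 = (t < c);         /* second cmpult */
+ #   c    = b1 + b2;       /* at most 1: if b1 is set then t >= 1 >= c */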
+ &st($a0,&QWPw(0,$rp)); # save
+ &add($b0,$tmp,$cc); # add the borrows
+
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl index 128f0f29d6..7a03c67b5b 100644 --- a/crypto/bn/asm/bn-586.pl +++ b/crypto/bn/asm/bn-586.pl @@ -1,7 +1,4 @@ #!/usr/local/bin/perl
-#
-
-#!/usr/local/bin/perl
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
@@ -11,8 +8,9 @@ require "x86asm.pl";
 &bn_mul_add_words("bn_mul_add_words");
 &bn_mul_words("bn_mul_words");
 &bn_sqr_words("bn_sqr_words");
-&bn_div64("bn_div64");
+&bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
 &asm_finish();
@@ -228,7 +226,7 @@ sub bn_sqr_words
 &function_end($name);
 }
-sub bn_div64
+sub bn_div_words
 {
 local($name)=@_;
@@ -307,7 +305,79 @@ sub bn_add_words
 }
 &set_label("aw_end",0);
- &mov("eax",$c);
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+ &and($num,0xfffffff8); # num / 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
 &function_end($name);
 }
diff --git a/crypto/bn/asm/bn-alpha.pl b/crypto/bn/asm/bn-alpha.pl new file mode 100644 index 0000000000..302edf2376 --- /dev/null +++ b/crypto/bn/asm/bn-alpha.pl @@ -0,0 +1,571 @@ +#!/usr/local/bin/perl
+# I have this in perl so I can use more useful register names and then convert
+# them into alpha registers.
+#
+
+$d=&data();
+$d =~ s/CC/0/g;
+$d =~ s/R1/1/g;
+$d =~ s/R2/2/g;
+$d =~ s/R3/3/g;
+$d =~ s/R4/4/g;
+$d =~ s/L1/5/g;
+$d =~ s/L2/6/g;
+$d =~ s/L3/7/g;
+$d =~ s/L4/8/g;
+$d =~ s/O1/22/g;
+$d =~ s/O2/23/g;
+$d =~ s/O3/24/g;
+$d =~ s/O4/25/g;
+$d =~ s/A1/20/g;
+$d =~ s/A2/21/g;
+$d =~ s/A3/27/g;
+$d =~ s/A4/28/g;
+if (0){
+}
+
+print $d;
+
+sub data
+ {
+ local($data)=<<'EOF';
+
+ # DEC Alpha assembler
+ # The bn_div_words is actually gcc output but the other parts are hand done.
+ # Thanks to tzeruch@ceddec.com for sending me the gcc output for
+ # bn_div_words.
+ # I've gone back and re-done most of the routines.
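+ # In C terms each word loop below computes (a hedged sketch with a
+ # notional 128-bit t; not the verbatim bn_asm.c source):
+ #   for (i = 0; i < num; i++) {
+ #     t = (u128)w * a[i] + r[i] + c;
+ #     r[i] = (BN_ULONG)t; c = (BN_ULONG)(t >> 64);
+ #   }
+ # Alpha has no 128-bit registers, so mulq/umulh produce the low and
+ # high product halves and each carry is rebuilt with a cmpult.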
+ # The key thing to remember for the 164 CPU is that while a
+ # multiply operation takes 8 cycles, another one can only be issued
+ # after 4 cycles have elapsed. I've done modifications to help
+ # improve this. Also, normally, the result of an ld instruction is not
+ # available for about 3 cycles.
+ .file 1 "bn_asm.c"
+ .set noat
+gcc2_compiled.:
+__gnu_compiled_c:
+ .text
+ .align 3
+ .globl bn_mul_add_words
+ .ent bn_mul_add_words
+bn_mul_add_words:
+bn_mul_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$CC
+ blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $A1,0($17) # 1 1
+ ldq $R1,0($16) # 1 1
+ .align 3
+$42:
+ mulq $A1,$19,$L1 # 1 2 1 ######
+ ldq $A2,8($17) # 2 1
+ ldq $R2,8($16) # 2 1
+ umulh $A1,$19,$A1 # 1 2 ######
+ ldq $A3,16($17) # 3 1
+ ldq $R3,16($16) # 3 1
+ mulq $A2,$19,$L2 # 2 2 1 ######
+ ldq $A4,24($17) # 4 1
+ addq $R1,$L1,$R1 # 1 2 2
+ ldq $R4,24($16) # 4 1
+ umulh $A2,$19,$A2 # 2 2 ######
+ cmpult $R1,$L1,$O1 # 1 2 3 1
+ addq $A1,$O1,$A1 # 1 3 1
+ addq $R1,$CC,$R1 # 1 2 3 1
+ mulq $A3,$19,$L3 # 3 2 1 ######
+ cmpult $R1,$CC,$CC # 1 2 3 2
+ addq $R2,$L2,$R2 # 2 2 2
+ addq $A1,$CC,$CC # 1 3 2
+ cmpult $R2,$L2,$O2 # 2 2 3 1
+ addq $A2,$O2,$A2 # 2 3 1
+ umulh $A3,$19,$A3 # 3 2 ######
+ addq $R2,$CC,$R2 # 2 2 3 1
+ cmpult $R2,$CC,$CC # 2 2 3 2
+ subq $18,4,$18
+ mulq $A4,$19,$L4 # 4 2 1 ######
+ addq $A2,$CC,$CC # 2 3 2
+ addq $R3,$L3,$R3 # 3 2 2
+ addq $16,32,$16
+ cmpult $R3,$L3,$O3 # 3 2 3 1
+ stq $R1,-32($16) # 1 2 4
+ umulh $A4,$19,$A4 # 4 2 ######
+ addq $A3,$O3,$A3 # 3 3 1
+ addq $R3,$CC,$R3 # 3 2 3 1
+ stq $R2,-24($16) # 2 2 4
+ cmpult $R3,$CC,$CC # 3 2 3 2
+ stq $R3,-16($16) # 3 2 4
+ addq $R4,$L4,$R4 # 4 2 2
+ addq $A3,$CC,$CC # 3 3 2
+ cmpult $R4,$L4,$O4 # 4 2 3 1
+ addq $17,32,$17
+ addq $A4,$O4,$A4 # 4 3 1
+ addq $R4,$CC,$R4 # 4 2 3 1
+ cmpult $R4,$CC,$CC # 4 2 3 2
+ stq $R4,-8($16) # 4 2 4
+ addq $A4,$CC,$CC # 4 3 2
+ blt $18,$43
+
+ ldq $A1,0($17) # 1 1
+ ldq $R1,0($16) # 1 1
+
+ br $42
+
+ .align 4
+$45:
+ ldq $A1,0($17) # 4 1
+ ldq $R1,0($16) # 4 1
+ mulq $A1,$19,$L1 # 4 2 1
+ subq $18,1,$18
+ addq $16,8,$16
+ addq $17,8,$17
+ umulh $A1,$19,$A1 # 4 2
+ addq $R1,$L1,$R1 # 4 2 2
+ cmpult $R1,$L1,$O1 # 4 2 3 1
+ addq $A1,$O1,$A1 # 4 3 1
+ addq $R1,$CC,$R1 # 4 2 3 1
+ cmpult $R1,$CC,$CC # 4 2 3 2
+ addq $A1,$CC,$CC # 4 3 2
+ stq $R1,-8($16) # 4 2 4
+ bgt $18,$45
+ ret $31,($26),1 # else exit
+
+ .align 4
+$43:
+ addq $18,4,$18
+ bgt $18,$45 # goto tail code
+ ret $31,($26),1 # else exit
+
+ .end bn_mul_add_words
+ .align 3
+ .globl bn_mul_words
+ .ent bn_mul_words
+bn_mul_words:
+bn_mul_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$CC
+ blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $A1,0($17) # 1 1
+ .align 3
+$142:
+
+ mulq $A1,$19,$L1 # 1 2 1 #####
+ ldq $A2,8($17) # 2 1
+ ldq $A3,16($17) # 3 1
+ umulh $A1,$19,$A1 # 1 2 #####
+ ldq $A4,24($17) # 4 1
+ mulq $A2,$19,$L2 # 2 2 1 #####
+ addq $L1,$CC,$L1 # 1 2 3 1
+ subq $18,4,$18
+ cmpult $L1,$CC,$CC # 1 2 3 2
+ umulh $A2,$19,$A2 # 2 2 #####
+ addq $A1,$CC,$CC # 1 3 2
+ addq $17,32,$17
+ addq $L2,$CC,$L2 # 2 2 3 1
+ mulq $A3,$19,$L3 # 3 2 1 #####
+ cmpult $L2,$CC,$CC # 2 2 3 2
+ addq $A2,$CC,$CC # 2 3 2
+ addq $16,32,$16
+ umulh $A3,$19,$A3 # 3 2 #####
+ stq $L1,-32($16) # 1 2 4
+ mulq $A4,$19,$L4 # 4 2 1 #####
+ addq $L3,$CC,$L3 # 3 2 3 1
+ stq $L2,-24($16) # 2 2 4
+ cmpult $L3,$CC,$CC # 3 2 3 2
+ umulh $A4,$19,$A4 # 4 2 #####
+ addq $A3,$CC,$CC # 3 3 2
+ stq $L3,-16($16) # 3 2 4
+ addq $L4,$CC,$L4
# 4 2 3 1 + cmpult $L4,$CC,$CC # 4 2 3 2 + + addq $A4,$CC,$CC # 4 3 2 + + stq $L4,-8($16) # 4 2 4 + + blt $18,$143 + + ldq $A1,0($17) # 1 1 + + br $142 + + .align 4 +$145: + ldq $A1,0($17) # 4 1 + mulq $A1,$19,$L1 # 4 2 1 + subq $18,1,$18 + umulh $A1,$19,$A1 # 4 2 + addq $L1,$CC,$L1 # 4 2 3 1 + addq $16,8,$16 + cmpult $L1,$CC,$CC # 4 2 3 2 + addq $17,8,$17 + addq $A1,$CC,$CC # 4 3 2 + stq $L1,-8($16) # 4 2 4 + + bgt $18,$145 + ret $31,($26),1 # else exit + + .align 4 +$143: + addq $18,4,$18 + bgt $18,$145 # goto tail code + ret $31,($26),1 # else exit + + .end bn_mul_words + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18,4,$18 + blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code + ldq $A1,0($17) # 1 1 + .align 3 +$542: + mulq $A1,$A1,$L1 ###### + ldq $A2,8($17) # 1 1 + subq $18,4 + umulh $A1,$A1,$R1 ###### + ldq $A3,16($17) # 1 1 + mulq $A2,$A2,$L2 ###### + ldq $A4,24($17) # 1 1 + stq $L1,0($16) # r[0] + umulh $A2,$A2,$R2 ###### + stq $R1,8($16) # r[1] + mulq $A3,$A3,$L3 ###### + stq $L2,16($16) # r[0] + umulh $A3,$A3,$R3 ###### + stq $R2,24($16) # r[1] + mulq $A4,$A4,$L4 ###### + stq $L3,32($16) # r[0] + umulh $A4,$A4,$R4 ###### + stq $R3,40($16) # r[1] + + addq $16,64,$16 + addq $17,32,$17 + stq $L4,-16($16) # r[0] + stq $R4,-8($16) # r[1] + + blt $18,$543 + ldq $A1,0($17) # 1 1 + br $542 + +$442: + ldq $A1,0($17) # a[0] + mulq $A1,$A1,$L1 # a[0]*w low part r2 + addq $16,16,$16 + addq $17,8,$17 + subq $18,1,$18 + umulh $A1,$A1,$R1 # a[0]*w high part r3 + stq $L1,-16($16) # r[0] + stq $R1,-8($16) # r[1] + + bgt $18,$442 + ret $31,($26),1 # else exit + + .align 4 +$543: + addq $18,4,$18 + bgt $18,$442 # goto tail code + ret $31,($26),1 # else exit + .end bn_sqr_words + + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19,4,$19 + bis $31,$31,$CC # carry = 0 + blt $19,$900 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + .align 3 +$901: + addq $R1,$L1,$R1 # r=a+b; + ldq $L2,8($17) # a[1] + cmpult $R1,$L1,$O1 # did we overflow? + ldq $R2,8($18) # b[1] + addq $R1,$CC,$R1 # c+= overflow + ldq $L3,16($17) # a[2] + cmpult $R1,$CC,$CC # overflow? + ldq $R3,16($18) # b[2] + addq $CC,$O1,$CC + ldq $L4,24($17) # a[3] + addq $R2,$L2,$R2 # r=a+b; + ldq $R4,24($18) # b[3] + cmpult $R2,$L2,$O2 # did we overflow? + addq $R3,$L3,$R3 # r=a+b; + addq $R2,$CC,$R2 # c+= overflow + cmpult $R3,$L3,$O3 # did we overflow? + cmpult $R2,$CC,$CC # overflow? + addq $R4,$L4,$R4 # r=a+b; + addq $CC,$O2,$CC + cmpult $R4,$L4,$O4 # did we overflow? + addq $R3,$CC,$R3 # c+= overflow + stq $R1,0($16) # r[0]=c + cmpult $R3,$CC,$CC # overflow? + stq $R2,8($16) # r[1]=c + addq $CC,$O3,$CC + stq $R3,16($16) # r[2]=c + addq $R4,$CC,$R4 # c+= overflow + subq $19,4,$19 # loop-- + cmpult $R4,$CC,$CC # overflow? + addq $17,32,$17 # a++ + addq $CC,$O4,$CC + stq $R4,24($16) # r[3]=c + addq $18,32,$18 # b++ + addq $16,32,$16 # r++ + + blt $19,$900 + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + br $901 + .align 4 +$945: + ldq $L1,0($17) # a[0] + ldq $R1,0($18) # b[1] + addq $R1,$L1,$R1 # r=a+b; + subq $19,1,$19 # loop-- + addq $R1,$CC,$R1 # c+= overflow + addq $17,8,$17 # a++ + cmpult $R1,$L1,$O1 # did we overflow? + cmpult $R1,$CC,$CC # overflow? 
+ addq $18,8,$18 # b++
+ stq $R1,0($16) # r[0]=c
+ addq $CC,$O1,$CC
+ addq $16,8,$16 # r++
+
+ bgt $19,$945
+ ret $31,($26),1 # else exit
+
+$900:
+ addq $19,4,$19
+ bgt $19,$945 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_add_words
+
+ .align 3
+ .globl bn_sub_words
+ .ent bn_sub_words
+bn_sub_words:
+bn_sub_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19,4,$19
+ bis $31,$31,$CC # carry = 0
+ br $800 # always take the tail code: the unrolled loop below was copied from bn_add_words and still adds
+ blt $19,$800
+ ldq $L1,0($17) # a[0]
+ ldq $R1,0($18) # b[0]
+ .align 3
+$801:
+ addq $R1,$L1,$R1 # r=a+b;
+ ldq $L2,8($17) # a[1]
+ cmpult $R1,$L1,$O1 # did we overflow?
+ ldq $R2,8($18) # b[1]
+ addq $R1,$CC,$R1 # c+= overflow
+ ldq $L3,16($17) # a[2]
+ cmpult $R1,$CC,$CC # overflow?
+ ldq $R3,16($18) # b[2]
+ addq $CC,$O1,$CC
+ ldq $L4,24($17) # a[3]
+ addq $R2,$L2,$R2 # r=a+b;
+ ldq $R4,24($18) # b[3]
+ cmpult $R2,$L2,$O2 # did we overflow?
+ addq $R3,$L3,$R3 # r=a+b;
+ addq $R2,$CC,$R2 # c+= overflow
+ cmpult $R3,$L3,$O3 # did we overflow?
+ cmpult $R2,$CC,$CC # overflow?
+ addq $R4,$L4,$R4 # r=a+b;
+ addq $CC,$O2,$CC
+ cmpult $R4,$L4,$O4 # did we overflow?
+ addq $R3,$CC,$R3 # c+= overflow
+ stq $R1,0($16) # r[0]=c
+ cmpult $R3,$CC,$CC # overflow?
+ stq $R2,8($16) # r[1]=c
+ addq $CC,$O3,$CC
+ stq $R3,16($16) # r[2]=c
+ addq $R4,$CC,$R4 # c+= overflow
+ subq $19,4,$19 # loop--
+ cmpult $R4,$CC,$CC # overflow?
+ addq $17,32,$17 # a++
+ addq $CC,$O4,$CC
+ stq $R4,24($16) # r[3]=c
+ addq $18,32,$18 # b++
+ addq $16,32,$16 # r++
+
+ blt $19,$800
+ ldq $L1,0($17) # a[0]
+ ldq $R1,0($18) # b[0]
+ br $801
+ .align 4
+$845:
+ ldq $L1,0($17) # a[0]
+ ldq $R1,0($18) # b[0]
+ cmpult $L1,$R1,$O1 # will we borrow?
+ subq $L1,$R1,$R1 # r=a-b;
+ subq $19,1,$19 # loop--
+ cmpult $R1,$CC,$O2 # will we borrow?
+ subq $R1,$CC,$R1 # subtract the carry
+ addq $17,8,$17 # a++
+ addq $18,8,$18 # b++
+ stq $R1,0($16) # r[0]=c
+ addq $O2,$O1,$CC # carry = borrow1 + borrow2
+ addq $16,8,$16 # r++
+
+ bgt $19,$845
+ ret $31,($26),1 # else exit
+
+$800:
+ addq $19,4,$19
+ bgt $19,$845 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_sub_words
+
+ #
+ # What follows was taken directly from the C compiler with a few
+ # hacks to redo the labels.
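+ # As a hedged C reading (ours, not the compiler's): bn_div_words(h,l,d)
+ # returns the low 64 bits of the quotient ((u128)h << 64 | l) / d,
+ # normalising d via BN_num_bits_word and then forming the quotient
+ # 32 bits at a time with divqu on the word halves.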
+ # +.text + .align 3 + .globl bn_div_words + .ent bn_div_words +bn_div_words: + ldgp $29,0($27) +bn_div_words..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$119 + lda $0,-1 + br $31,$136 + .align 4 +$119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$126 + zapnot $7,15,$27 + br $31,$127 + .align 4 +$126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$127: + srl $10,32,$4 + .align 5 +$128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$129 + subq $27,1,$27 + br $31,$128 + .align 4 +$129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$134 + addq $9,$11,$9 + subq $27,1,$27 +$134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$123 + .align 4 +$124: + bis $13,$27,$0 +$136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div_words +EOF + return($data); + } + diff --git a/crypto/bn/asm/bn-win32.asm b/crypto/bn/asm/bn-win32.asm index 017ea462b0..871bd88d77 100644 --- a/crypto/bn/asm/bn-win32.asm +++ b/crypto/bn/asm/bn-win32.asm @@ -485,9 +485,9 @@ $L010sw_end: _bn_sqr_words ENDP _TEXT ENDS _TEXT SEGMENT -PUBLIC _bn_div64 +PUBLIC _bn_div_words -_bn_div64 PROC NEAR +_bn_div_words PROC NEAR push ebp push ebx push esi @@ -501,7 +501,7 @@ _bn_div64 PROC NEAR pop ebx pop ebp ret -_bn_div64 ENDP +_bn_div_words ENDP _TEXT ENDS _TEXT SEGMENT PUBLIC _bn_add_words @@ -678,7 +678,6 @@ $L011aw_finish: adc eax, 0 mov DWORD PTR 24[ebx],ecx $L013aw_end: - mov eax, eax pop edi pop esi pop ebx @@ -686,4 +685,1438 @@ $L013aw_end: ret _bn_add_words ENDP _TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sub_words + +_bn_sub_words PROC NEAR + push ebp + push ebx + push esi + push edi + ; + mov ebx, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + mov edi, DWORD PTR 28[esp] + mov ebp, DWORD PTR 32[esp] + xor eax, eax + and ebp, 4294967288 + jz $L014aw_finish +L015aw_loop: + ; Round 0 + mov ecx, DWORD PTR [esi] + mov edx, DWORD PTR [edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR [ebx],ecx + ; Round 1 + mov ecx, DWORD PTR 4[esi] + mov edx, DWORD PTR 4[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 4[ebx],ecx + ; Round 2 + mov ecx, DWORD PTR 8[esi] + mov edx, DWORD PTR 8[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 8[ebx],ecx + ; Round 3 + mov ecx, DWORD 
PTR 12[esi] + mov edx, DWORD PTR 12[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 12[ebx],ecx + ; Round 4 + mov ecx, DWORD PTR 16[esi] + mov edx, DWORD PTR 16[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 16[ebx],ecx + ; Round 5 + mov ecx, DWORD PTR 20[esi] + mov edx, DWORD PTR 20[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 20[ebx],ecx + ; Round 6 + mov ecx, DWORD PTR 24[esi] + mov edx, DWORD PTR 24[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 24[ebx],ecx + ; Round 7 + mov ecx, DWORD PTR 28[esi] + mov edx, DWORD PTR 28[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 28[ebx],ecx + ; + add esi, 32 + add edi, 32 + add ebx, 32 + sub ebp, 8 + jnz L015aw_loop +$L014aw_finish: + mov ebp, DWORD PTR 32[esp] + and ebp, 7 + jz $L016aw_end + ; Tail Round 0 + mov ecx, DWORD PTR [esi] + mov edx, DWORD PTR [edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR [ebx],ecx + jz $L016aw_end + ; Tail Round 1 + mov ecx, DWORD PTR 4[esi] + mov edx, DWORD PTR 4[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 4[ebx],ecx + jz $L016aw_end + ; Tail Round 2 + mov ecx, DWORD PTR 8[esi] + mov edx, DWORD PTR 8[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 8[ebx],ecx + jz $L016aw_end + ; Tail Round 3 + mov ecx, DWORD PTR 12[esi] + mov edx, DWORD PTR 12[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 12[ebx],ecx + jz $L016aw_end + ; Tail Round 4 + mov ecx, DWORD PTR 16[esi] + mov edx, DWORD PTR 16[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 16[ebx],ecx + jz $L016aw_end + ; Tail Round 5 + mov ecx, DWORD PTR 20[esi] + mov edx, DWORD PTR 20[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + dec ebp + mov DWORD PTR 20[ebx],ecx + jz $L016aw_end + ; Tail Round 6 + mov ecx, DWORD PTR 24[esi] + mov edx, DWORD PTR 24[edi] + sub ecx, eax + mov eax, 0 + adc eax, eax + sub ecx, edx + adc eax, 0 + mov DWORD PTR 24[ebx],ecx +$L016aw_end: + pop edi + pop esi + pop ebx + pop ebp + ret +_bn_sub_words ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_mul_comba8 + +_bn_mul_comba8 PROC NEAR + push esi + mov esi, DWORD PTR 12[esp] + push edi + mov edi, DWORD PTR 20[esp] + push ebp + push ebx + xor ebx, ebx + mov eax, DWORD PTR [esi] + xor ecx, ecx + mov edx, DWORD PTR [edi] + ; ################## Calculate word 0 + xor ebp, ebp + ; mul a[0]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR [eax],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx, ebx + ; mul a[1]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[0]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 4[eax],ecx + mov eax, DWORD PTR 8[esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx, ecx + ; mul a[2]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[1]*b[1] + mul edx + add ebp, eax + mov eax, DWORD 
PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[0]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 8[eax],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp, ebp + ; mul a[3]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[2]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[1]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[0]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR 12[eax],ebx + mov eax, DWORD PTR 16[esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx, ebx + ; mul a[4]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[3]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[2]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[1]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[0]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 16[eax],ecx + mov eax, DWORD PTR 20[esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx, ecx + ; mul a[5]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[4]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR 12[esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[3]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + ; mul a[2]*b[3] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 16[edi] + adc ecx, 0 + ; mul a[1]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[0]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 20[eax],ebp + mov eax, DWORD PTR 24[esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp, ebp + ; mul a[6]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[5]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 16[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[4]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR 12[esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[3]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 16[edi] + adc ebp, 0 + ; mul a[2]*b[4] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 20[edi] + adc ebp, 0 + ; mul a[1]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[0]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR 24[eax],ebx + mov eax, DWORD PTR 
28[esi] + ; saved r[6] + ; ################## Calculate word 7 + xor ebx, ebx + ; mul a[7]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[6]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[5]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 16[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[4]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[3]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 20[edi] + adc ebx, 0 + ; mul a[2]*b[5] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 24[edi] + adc ebx, 0 + ; mul a[1]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[0]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + mov DWORD PTR 28[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[7] + ; ################## Calculate word 8 + xor ecx, ecx + ; mul a[7]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR 24[esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[6]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + ; mul a[5]*b[3] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 16[edi] + adc ecx, 0 + ; mul a[4]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR 12[esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[3]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 24[edi] + adc ecx, 0 + ; mul a[2]*b[6] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 28[edi] + adc ecx, 0 + ; mul a[1]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + mov DWORD PTR 32[eax],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[8] + ; ################## Calculate word 9 + xor ebp, ebp + ; mul a[7]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR 24[esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[6]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 16[edi] + adc ebp, 0 + ; mul a[5]*b[4] + mul edx + add ebx, eax + mov eax, DWORD PTR 16[esi] + adc ecx, edx + mov edx, DWORD PTR 20[edi] + adc ebp, 0 + ; mul a[4]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR 12[esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[3]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 28[edi] + adc ebp, 0 + ; mul a[2]*b[7] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + mov DWORD PTR 36[eax],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[9] + ; ################## Calculate word 10 + xor ebx, ebx + ; mul a[7]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + ; mul a[6]*b[4] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esi] + adc ebp, edx + mov edx, DWORD PTR 20[edi] + adc ebx, 0 + ; mul a[5]*b[5] + mul edx + add ecx, eax + mov eax, DWORD PTR 16[esi] + adc ebp, edx + mov edx, DWORD PTR 24[edi] + adc ebx, 0 + ; mul 
a[4]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR 12[esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[3]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 16[edi] + adc ebx, 0 + mov DWORD PTR 40[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[10] + ; ################## Calculate word 11 + xor ecx, ecx + ; mul a[7]*b[4] + mul edx + add ebp, eax + mov eax, DWORD PTR 24[esi] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + ; mul a[6]*b[5] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esi] + adc ebx, edx + mov edx, DWORD PTR 24[edi] + adc ecx, 0 + ; mul a[5]*b[6] + mul edx + add ebp, eax + mov eax, DWORD PTR 16[esi] + adc ebx, edx + mov edx, DWORD PTR 28[edi] + adc ecx, 0 + ; mul a[4]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 20[edi] + adc ecx, 0 + mov DWORD PTR 44[eax],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[11] + ; ################## Calculate word 12 + xor ebp, ebp + ; mul a[7]*b[5] + mul edx + add ebx, eax + mov eax, DWORD PTR 24[esi] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + ; mul a[6]*b[6] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esi] + adc ecx, edx + mov edx, DWORD PTR 28[edi] + adc ebp, 0 + ; mul a[5]*b[7] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 24[edi] + adc ebp, 0 + mov DWORD PTR 48[eax],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[12] + ; ################## Calculate word 13 + xor ebx, ebx + ; mul a[7]*b[6] + mul edx + add ecx, eax + mov eax, DWORD PTR 24[esi] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + ; mul a[6]*b[7] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 28[edi] + adc ebx, 0 + mov DWORD PTR 52[eax],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[13] + ; ################## Calculate word 14 + xor ecx, ecx + ; mul a[7]*b[7] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + adc ecx, 0 + mov DWORD PTR 56[eax],ebp + ; saved r[14] + ; save r[15] + mov DWORD PTR 60[eax],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_mul_comba8 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_mul_comba4 + +_bn_mul_comba4 PROC NEAR + push esi + mov esi, DWORD PTR 12[esp] + push edi + mov edi, DWORD PTR 20[esp] + push ebp + push ebx + xor ebx, ebx + mov eax, DWORD PTR [esi] + xor ecx, ecx + mov edx, DWORD PTR [edi] + ; ################## Calculate word 0 + xor ebp, ebp + ; mul a[0]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR [edi] + adc ebp, 0 + mov DWORD PTR [eax],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx, ebx + ; mul a[1]*b[0] + mul edx + add ecx, eax + mov eax, DWORD PTR [esi] + adc ebp, edx + mov edx, DWORD PTR 4[edi] + adc ebx, 0 + ; mul a[0]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR [edi] + adc ebx, 0 + mov DWORD PTR 4[eax],ecx + mov eax, DWORD PTR 8[esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx, ecx + ; mul a[2]*b[0] + mul edx + add ebp, eax + mov eax, DWORD PTR 4[esi] + adc ebx, edx + mov edx, DWORD PTR 4[edi] + adc ecx, 0 + ; mul a[1]*b[1] + mul edx + add ebp, eax + mov eax, DWORD PTR [esi] + adc ebx, edx + mov edx, DWORD PTR 8[edi] + adc ecx, 0 + ; mul a[0]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR [edi] + adc ecx, 0 + mov DWORD PTR 8[eax],ebp + mov eax, DWORD 
PTR 12[esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp, ebp + ; mul a[3]*b[0] + mul edx + add ebx, eax + mov eax, DWORD PTR 8[esi] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + ; mul a[2]*b[1] + mul edx + add ebx, eax + mov eax, DWORD PTR 4[esi] + adc ecx, edx + mov edx, DWORD PTR 8[edi] + adc ebp, 0 + ; mul a[1]*b[2] + mul edx + add ebx, eax + mov eax, DWORD PTR [esi] + adc ecx, edx + mov edx, DWORD PTR 12[edi] + adc ebp, 0 + ; mul a[0]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + mov edx, DWORD PTR 4[edi] + adc ebp, 0 + mov DWORD PTR 12[eax],ebx + mov eax, DWORD PTR 12[esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx, ebx + ; mul a[3]*b[1] + mul edx + add ecx, eax + mov eax, DWORD PTR 8[esi] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + ; mul a[2]*b[2] + mul edx + add ecx, eax + mov eax, DWORD PTR 4[esi] + adc ebp, edx + mov edx, DWORD PTR 12[edi] + adc ebx, 0 + ; mul a[1]*b[3] + mul edx + add ecx, eax + mov eax, DWORD PTR 20[esp] + adc ebp, edx + mov edx, DWORD PTR 8[edi] + adc ebx, 0 + mov DWORD PTR 16[eax],ecx + mov eax, DWORD PTR 12[esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx, ecx + ; mul a[3]*b[2] + mul edx + add ebp, eax + mov eax, DWORD PTR 8[esi] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + ; mul a[2]*b[3] + mul edx + add ebp, eax + mov eax, DWORD PTR 20[esp] + adc ebx, edx + mov edx, DWORD PTR 12[edi] + adc ecx, 0 + mov DWORD PTR 20[eax],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp, ebp + ; mul a[3]*b[3] + mul edx + add ebx, eax + mov eax, DWORD PTR 20[esp] + adc ecx, edx + adc ebp, 0 + mov DWORD PTR 24[eax],ebx + ; saved r[6] + ; save r[7] + mov DWORD PTR 28[eax],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_mul_comba4 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sqr_comba8 + +_bn_sqr_comba8 PROC NEAR + push esi + push edi + push ebp + push ebx + mov edi, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + xor ebx, ebx + xor ecx, ecx + mov eax, DWORD PTR [esi] + ; ############### Calculate word 0 + xor ebp, ebp + ; sqr a[0]*a[0] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR [edi],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx, ebx + ; sqr a[1]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 4[edi],ecx + mov edx, DWORD PTR [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx, ecx + ; sqr a[2]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 4[esi] + adc ecx, 0 + ; sqr a[1]*a[1] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR [esi] + adc ecx, 0 + mov DWORD PTR 8[edi],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp, ebp + ; sqr a[3]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 8[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[2]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 16[esi] + adc ebp, 0 + mov DWORD PTR 12[edi],ebx + mov edx, DWORD PTR [esi] + ; saved r[3] + ; ############### Calculate word 4 + xor ebx, ebx + ; sqr a[4]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 
12[esi] + adc ebx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[3]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + ; sqr a[2]*a[2] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR [esi] + adc ebx, 0 + mov DWORD PTR 16[edi],ecx + mov eax, DWORD PTR 20[esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx, ecx + ; sqr a[5]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 16[esi] + adc ecx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[4]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 12[esi] + adc ecx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[3]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov DWORD PTR 20[edi],ebp + mov edx, DWORD PTR [esi] + ; saved r[5] + ; ############### Calculate word 6 + xor ebp, ebp + ; sqr a[6]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 20[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[5]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 16[esi] + adc ebp, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[4]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 12[esi] + adc ebp, 0 + ; sqr a[3]*a[3] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR 24[edi],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[6] + ; ############### Calculate word 7 + xor ebx, ebx + ; sqr a[7]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 24[esi] + adc ebx, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[6]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 20[esi] + adc ebx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[5]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 16[esi] + adc ebx, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[4]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 28[esi] + adc ebx, 0 + mov DWORD PTR 28[edi],ecx + mov edx, DWORD PTR 4[esi] + ; saved r[7] + ; ############### Calculate word 8 + xor ecx, ecx + ; sqr a[7]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov edx, DWORD PTR 8[esi] + ; sqr a[6]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 20[esi] + adc ecx, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[5]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 16[esi] + adc ecx, 0 + ; sqr a[4]*a[4] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR 8[esi] + adc ecx, 0 + mov DWORD PTR 32[edi],ebp + mov eax, DWORD PTR 28[esi] + ; saved r[8] + ; ############### Calculate word 9 + xor ebp, ebp + ; sqr a[7]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 24[esi] + adc ebp, 0 + mov edx, DWORD PTR 12[esi] + ; sqr a[6]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 20[esi] + adc ebp, 0 + mov edx, DWORD PTR 
16[esi] + ; sqr a[5]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 28[esi] + adc ebp, 0 + mov DWORD PTR 36[edi],ebx + mov edx, DWORD PTR 12[esi] + ; saved r[9] + ; ############### Calculate word 10 + xor ebx, ebx + ; sqr a[7]*a[3] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 24[esi] + adc ebx, 0 + mov edx, DWORD PTR 16[esi] + ; sqr a[6]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 20[esi] + adc ebx, 0 + ; sqr a[5]*a[5] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR 16[esi] + adc ebx, 0 + mov DWORD PTR 40[edi],ecx + mov eax, DWORD PTR 28[esi] + ; saved r[10] + ; ############### Calculate word 11 + xor ecx, ecx + ; sqr a[7]*a[4] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 24[esi] + adc ecx, 0 + mov edx, DWORD PTR 20[esi] + ; sqr a[6]*a[5] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 28[esi] + adc ecx, 0 + mov DWORD PTR 44[edi],ebp + mov edx, DWORD PTR 20[esi] + ; saved r[11] + ; ############### Calculate word 12 + xor ebp, ebp + ; sqr a[7]*a[5] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 24[esi] + adc ebp, 0 + ; sqr a[6]*a[6] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR 24[esi] + adc ebp, 0 + mov DWORD PTR 48[edi],ebx + mov eax, DWORD PTR 28[esi] + ; saved r[12] + ; ############### Calculate word 13 + xor ebx, ebx + ; sqr a[7]*a[6] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 28[esi] + adc ebx, 0 + mov DWORD PTR 52[edi],ecx + ; saved r[13] + ; ############### Calculate word 14 + xor ecx, ecx + ; sqr a[7]*a[7] + mul eax + add ebp, eax + adc ebx, edx + adc ecx, 0 + mov DWORD PTR 56[edi],ebp + ; saved r[14] + mov DWORD PTR 60[edi],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_sqr_comba8 ENDP +_TEXT ENDS +_TEXT SEGMENT +PUBLIC _bn_sqr_comba4 + +_bn_sqr_comba4 PROC NEAR + push esi + push edi + push ebp + push ebx + mov edi, DWORD PTR 20[esp] + mov esi, DWORD PTR 24[esp] + xor ebx, ebx + xor ecx, ecx + mov eax, DWORD PTR [esi] + ; ############### Calculate word 0 + xor ebp, ebp + ; sqr a[0]*a[0] + mul eax + add ebx, eax + adc ecx, edx + mov edx, DWORD PTR [esi] + adc ebp, 0 + mov DWORD PTR [edi],ebx + mov eax, DWORD PTR 4[esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx, ebx + ; sqr a[1]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 4[edi],ecx + mov edx, DWORD PTR [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx, ecx + ; sqr a[2]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 4[esi] + adc ecx, 0 + ; sqr a[1]*a[1] + mul eax + add ebp, eax + adc ebx, edx + mov edx, DWORD PTR [esi] + adc ecx, 0 + mov DWORD PTR 8[edi],ebp + mov eax, DWORD PTR 12[esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp, ebp + ; sqr a[3]*a[0] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 8[esi] + adc ebp, 0 + mov edx, DWORD PTR 4[esi] + ; sqr a[2]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebp, 0 + add ebx, eax + adc ecx, edx + mov eax, DWORD PTR 12[esi] + adc ebp, 0 + mov DWORD PTR 12[edi],ebx + 
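; (hedged aside, ours) in these sqr_comba routines each cross product
+ ; a[i]*a[j] with i > j occurs twice in the square, so it is doubled in
+ ; registers first (add eax,eax / adc edx,edx, with the following adc
+ ; reg,0 catching the bit shifted out), in C roughly t = 2*(u64)a[i]*a[j],
+ ; before being accumulated into the current result word.
+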
mov edx, DWORD PTR 4[esi] + ; saved r[3] + ; ############### Calculate word 4 + xor ebx, ebx + ; sqr a[3]*a[1] + mul edx + add eax, eax + adc edx, edx + adc ebx, 0 + add ecx, eax + adc ebp, edx + mov eax, DWORD PTR 8[esi] + adc ebx, 0 + ; sqr a[2]*a[2] + mul eax + add ecx, eax + adc ebp, edx + mov edx, DWORD PTR 8[esi] + adc ebx, 0 + mov DWORD PTR 16[edi],ecx + mov eax, DWORD PTR 12[esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx, ecx + ; sqr a[3]*a[2] + mul edx + add eax, eax + adc edx, edx + adc ecx, 0 + add ebp, eax + adc ebx, edx + mov eax, DWORD PTR 12[esi] + adc ecx, 0 + mov DWORD PTR 20[edi],ebp + ; saved r[5] + ; ############### Calculate word 6 + xor ebp, ebp + ; sqr a[3]*a[3] + mul eax + add ebx, eax + adc ecx, edx + adc ebp, 0 + mov DWORD PTR 24[edi],ebx + ; saved r[6] + mov DWORD PTR 28[edi],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +_bn_sqr_comba4 ENDP +_TEXT ENDS END diff --git a/crypto/bn/asm/bn86unix.cpp b/crypto/bn/asm/bn86unix.cpp index 64702201ea..639a3ac41c 100644 --- a/crypto/bn/asm/bn86unix.cpp +++ b/crypto/bn/asm/bn86unix.cpp @@ -12,8 +12,13 @@ #define bn_mul_add_words _bn_mul_add_words #define bn_mul_words _bn_mul_words #define bn_sqr_words _bn_sqr_words -#define bn_div64 _bn_div64 +#define bn_div_words _bn_div_words #define bn_add_words _bn_add_words +#define bn_sub_words _bn_sub_words +#define bn_mul_comba8 _bn_mul_comba8 +#define bn_mul_comba4 _bn_mul_comba4 +#define bn_sqr_comba8 _bn_sqr_comba8 +#define bn_sqr_comba4 _bn_sqr_comba4 #endif @@ -544,9 +549,9 @@ bn_sqr_words: .ident "bn_sqr_words" .text .align ALIGN -.globl bn_div64 - TYPE(bn_div64,@function) -bn_div64: +.globl bn_div_words + TYPE(bn_div_words,@function) +bn_div_words: pushl %ebp pushl %ebx pushl %esi @@ -561,9 +566,9 @@ bn_div64: popl %ebx popl %ebp ret -.bn_div64_end: - SIZE(bn_div64,.bn_div64_end-bn_div64) -.ident "bn_div64" +.bn_div_words_end: + SIZE(bn_div_words,.bn_div_words_end-bn_div_words) +.ident "bn_div_words" .text .align ALIGN .globl bn_add_words @@ -741,7 +746,6 @@ bn_add_words: adcl $0, %eax movl %ecx, 24(%ebx) .L013aw_end: - movl %eax, %eax popl %edi popl %esi popl %ebx @@ -750,3 +754,1448 @@ bn_add_words: .bn_add_words_end: SIZE(bn_add_words,.bn_add_words_end-bn_add_words) .ident "bn_add_words" +.text + .align ALIGN +.globl bn_sub_words + TYPE(bn_sub_words,@function) +bn_sub_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L014aw_finish +.L015aw_loop: + /* Round 0 */ + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + /* Round 1 */ + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + /* Round 2 */ + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + /* Round 3 */ + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + /* Round 4 */ + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 16(%ebx) + /* Round 5 */ + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + 
movl %ecx, 20(%ebx) + /* Round 6 */ + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + /* Round 7 */ + movl 28(%esi), %ecx + movl 28(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L015aw_loop +.L014aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L016aw_end + /* Tail Round 0 */ + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L016aw_end + /* Tail Round 1 */ + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L016aw_end + /* Tail Round 2 */ + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L016aw_end + /* Tail Round 3 */ + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 12(%ebx) + jz .L016aw_end + /* Tail Round 4 */ + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L016aw_end + /* Tail Round 5 */ + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L016aw_end + /* Tail Round 6 */ + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) +.L016aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sub_words_end: + SIZE(bn_sub_words,.bn_sub_words_end-bn_sub_words) +.ident "bn_sub_words" +.text + .align ALIGN +.globl bn_mul_comba8 + TYPE(bn_mul_comba8,@function) +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl 
%eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[4]*b[0] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[5]*b[0] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[1] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[4] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[6]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[1] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[2] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[4] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[5] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + /* saved r[6] */ + /* ################## Calculate word 7 */ + xorl %ebx, %ebx + /* mul a[7]*b[0] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul 
a[5]*b[2] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[3] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[4] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[5] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[6] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + /* saved r[7] */ + /* ################## Calculate word 8 */ + xorl %ecx, %ecx + /* mul a[7]*b[1] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[3] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[4] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[5] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[6] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + /* saved r[8] */ + /* ################## Calculate word 9 */ + xorl %ebp, %ebp + /* mul a[7]*b[2] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[4] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[5] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[6] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + /* saved r[9] */ + /* ################## Calculate word 10 */ + xorl %ebx, %ebx + /* mul a[7]*b[3] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[5] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[6] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + /* saved r[10] */ + /* 
################## Calculate word 11 */ + xorl %ecx, %ecx + /* mul a[7]*b[4] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[6] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + /* saved r[11] */ + /* ################## Calculate word 12 */ + xorl %ebp, %ebp + /* mul a[7]*b[5] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + /* saved r[12] */ + /* ################## Calculate word 13 */ + xorl %ebx, %ebx + /* mul a[7]*b[6] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + /* saved r[13] */ + /* ################## Calculate word 14 */ + xorl %ecx, %ecx + /* mul a[7]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + /* saved r[14] */ + /* save r[15] */ + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_mul_comba4 + TYPE(bn_mul_comba4,@function) +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul 
a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + /* saved r[6] */ + /* save r[7] */ + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba8 + TYPE(bn_sqr_comba8,@function) +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[4]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 
4(%esi), %edx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[5]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + /* sqr a[4]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[6]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[5]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + /* sqr a[4]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + /* saved r[6] */ + /* ############### Calculate word 7 */ + xorl %ebx, %ebx + /* sqr a[7]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[6]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + /* sqr a[5]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + /* sqr a[4]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + /* saved r[7] */ + /* ############### Calculate word 8 */ + xorl %ecx, %ecx + /* sqr a[7]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[6]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + /* sqr a[5]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + /* sqr a[4]*a[4] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + /* saved r[8] */ + /* ############### Calculate word 9 */ + xorl %ebp, %ebp + /* sqr a[7]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + 
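/* note: each cross product a[i]*a[j] (i != j) is doubled via the */ + /* addl eax,eax / adcl edx,edx pair below before it is accumulated */ +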
/* sqr a[6]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + /* sqr a[5]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + /* saved r[9] */ + /* ############### Calculate word 10 */ + xorl %ebx, %ebx + /* sqr a[7]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + /* sqr a[6]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + /* sqr a[5]*a[5] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + /* saved r[10] */ + /* ############### Calculate word 11 */ + xorl %ecx, %ecx + /* sqr a[7]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + /* sqr a[6]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + /* saved r[11] */ + /* ############### Calculate word 12 */ + xorl %ebp, %ebp + /* sqr a[7]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + /* sqr a[6]*a[6] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + /* saved r[12] */ + /* ############### Calculate word 13 */ + xorl %ebx, %ebx + /* sqr a[7]*a[6] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + /* saved r[13] */ + /* ############### Calculate word 14 */ + xorl %ecx, %ecx + /* sqr a[7]*a[7] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + /* saved r[14] */ + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba4 + TYPE(bn_sqr_comba4,@function) +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + 
movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + /* saved r[6] */ + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4) +.ident "desasm.pl" diff --git a/crypto/bn/asm/ca.pl b/crypto/bn/asm/ca.pl new file mode 100644 index 0000000000..181d1f007e --- /dev/null +++ b/crypto/bn/asm/ca.pl @@ -0,0 +1,33 @@ +#!/usr/local/bin/perl +# I have this in perl so I can use more useful register names and then convert +# them into alpha registers. +# + +push(@INC,"perlasm","../../perlasm"); +require "alpha.pl"; +require "alpha/mul_add.pl"; +require "alpha/mul.pl"; +require "alpha/sqr.pl"; +require "alpha/add.pl"; +require "alpha/sub.pl"; +require "alpha/mul_c8.pl"; +require "alpha/mul_c4.pl"; +require "alpha/sqr_c4.pl"; +require "alpha/sqr_c8.pl"; +require "alpha/div.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_words("bn_mul_words"); +&bn_sqr_words("bn_sqr_words"); +&bn_mul_add_words("bn_mul_add_words"); +&bn_add_words("bn_add_words"); +&bn_sub_words("bn_sub_words"); +&bn_div_words("bn_div_words"); +&bn_mul_comba8("bn_mul_comba8"); +&bn_mul_comba4("bn_mul_comba4"); +&bn_sqr_comba4("bn_sqr_comba4"); +&bn_sqr_comba8("bn_sqr_comba8"); + +&asm_finish(); + diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl new file mode 100644 index 0000000000..0bcb5a6d47 --- /dev/null +++ b/crypto/bn/asm/co-586.pl @@ -0,0 +1,286 @@ +#!/usr/local/bin/perl + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_comba("bn_mul_comba8",8); +&bn_mul_comba("bn_mul_comba4",4); +&bn_sqr_comba("bn_sqr_comba8",8); +&bn_sqr_comba("bn_sqr_comba4",4); + +&asm_finish(); + +sub mul_add_c + { + local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 to load the return value + + &comment("mul a[$ai]*b[$bi]"); + + # "eax" and "edx" will always be pre-loaded. 
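+ # Roughly, in C terms, each call folds one 32x32->64 product into a + # three-word column sum: t = (unsigned long long)a[ai]*b[bi]; + # c0 += (t & 0xffffffff); c1 += (t >> 32) + carry; c2 += carry; + # so (c2:c1:c0) always holds the running column for output word $i.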
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + &mul("edx"); + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a + &mov("eax",&wparam(0)) if $pos > 0; # load r[] + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b + &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a + } + +sub sqr_add_c + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. + # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$b,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add($c0,"eax"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + ### + &adc($c1,"edx"); + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); + ### + &adc($c2,0); + # is pos > 1, it means it is the last loop + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + } + +sub sqr_add_c2 + { + local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; + + # pos == -1 if eax and edx are pre-loaded, 0 to load from next + # words, and 1 if load return value + + &comment("sqr a[$ai]*a[$bi]"); + + # "eax" and "edx" will always be pre-loaded. + # &mov("eax",&DWP($ai*4,$a,"",0)) ; + # &mov("edx",&DWP($bi*4,$a,"",0)); + + if ($ai == $bi) + { &mul("eax");} + else + { &mul("edx");} + &add("eax","eax"); + ### + &adc("edx","edx"); + ### + &adc($c2,0); + &add($c0,"eax"); + &adc($c1,"edx"); + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a + &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b + &adc($c2,0); + &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; + &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); + ### + } + +sub bn_mul_comba + { + local($name,$num)=@_; + local($a,$b,$c0,$c1,$c2); + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($tot,$end); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $b="edi"; + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + &push("esi"); + &mov($a,&wparam(1)); + &push("edi"); + &mov($b,&wparam(2)); + &push("ebp"); + &push("ebx"); + + &xor($c0,$c0); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + &xor($c1,$c1); + &mov("edx",&DWP(0,$b,"",0)); # load the first second + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("################## Calculate word $i"); + + for ($j=$bs; $j<$end; $j++) + { + &xor($c2,$c2) if ($j == $bs); + if (($j+1) == $end) + { + $v=1; + $v=2 if (($i+1) == $tot); + } + else + { $v=0; } + if (($j+1) != $end) + { + $na=($ai-1); + $nb=($bi+1); + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } +#printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; + &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); + if ($v) + { + &comment("saved r[$i]"); + # &mov("eax",&wparam(0)); + # &mov(&DWP($i*4,"eax","",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + } + $ai--; + $bi++; + } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &comment("save r[$i]"); + # &mov("eax",&wparam(0)); + &mov(&DWP($i*4,"eax","",0),$c0); + + 
&pop("ebx"); + &pop("ebp"); + &pop("edi"); + &pop("esi"); + &ret(); + &function_end_B($name); + } + +sub bn_sqr_comba + { + local($name,$num)=@_; + local($r,$a,$c0,$c1,$c2)=@_; + local($i,$as,$ae,$bs,$be,$ai,$bi); + local($b,$tot,$end,$half); + + &function_begin_B($name,""); + + $c0="ebx"; + $c1="ecx"; + $c2="ebp"; + $a="esi"; + $r="edi"; + + &push("esi"); + &push("edi"); + &push("ebp"); + &push("ebx"); + &mov($r,&wparam(0)); + &mov($a,&wparam(1)); + &xor($c0,$c0); + &xor($c1,$c1); + &mov("eax",&DWP(0,$a,"",0)); # load the first word + + $as=0; + $ae=0; + $bs=0; + $be=0; + $tot=$num+$num-1; + + for ($i=0; $i<$tot; $i++) + { + $ai=$as; + $bi=$bs; + $end=$be+1; + + &comment("############### Calculate word $i"); + for ($j=$bs; $j<$end; $j++) + { + &xor($c2,$c2) if ($j == $bs); + if (($ai-1) < ($bi+1)) + { + $v=1; + $v=2 if ($i+1) == $tot; + } + else + { $v=0; } + if (!$v) + { + $na=$ai-1; + $nb=$bi+1; + } + else + { + $na=$as+($i < ($num-1)); + $nb=$bs+($i >= ($num-1)); + } + if ($ai == $bi) + { + &sqr_add_c($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + else + { + &sqr_add_c2($r,$a,$ai,$bi, + $c0,$c1,$c2,$v,$i,$na,$nb); + } + if ($v) + { + &comment("saved r[$i]"); + #&mov(&DWP($i*4,$r,"",0),$c0); + ($c0,$c1,$c2)=($c1,$c2,$c0); + last; + } + $ai--; + $bi++; + } + $as++ if ($i < ($num-1)); + $ae++ if ($i >= ($num-1)); + + $bs++ if ($i >= ($num-1)); + $be++ if ($i < ($num-1)); + } + &mov(&DWP($i*4,$r,"",0),$c0); + &pop("ebx"); + &pop("ebp"); + &pop("edi"); + &pop("esi"); + &ret(); + &function_end_B($name); + } diff --git a/crypto/bn/asm/co-alpha.pl b/crypto/bn/asm/co-alpha.pl new file mode 100644 index 0000000000..23869a4ef5 --- /dev/null +++ b/crypto/bn/asm/co-alpha.pl @@ -0,0 +1,116 @@ +#!/usr/local/bin/perl +# I have this in perl so I can use more usefull register names and then convert +# them into alpha registers. +# + +push(@INC,"perlasm","../../perlasm"); +require "alpha.pl"; + +&asm_init($ARGV[0],"bn-586.pl"); + +print &bn_sub_words("bn_sub_words"); + +&asm_finish(); + +sub bn_sub_words + { + local($name)=@_; + local($cc,$a,$b,$r); + + $cc="r0"; + $a0="r1"; $b0="r5"; $r0="r9"; $tmp="r13"; + $a1="r2"; $b1="r6"; $r1="r10"; $t1="r14"; + $a2="r3"; $b2="r7"; $r2="r11"; + $a3="r4"; $b3="r8"; $r3="r12"; $t3="r15"; + + $rp=&wparam(0); + $ap=&wparam(1); + $bp=&wparam(2); + $count=&wparam(3); + + &function_begin($name,""); + + &comment(""); + &sub($count,4,$count); + &mov("zero",$cc); + &blt($count,&label("finish")); + + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + +########################################################## + &set_label("loop"); + + &ld($a1,&QWPw(1,$ap)); + &cmpult($a0,$b0,$tmp); # will we borrow? + &ld($b1,&QWPw(1,$bp)); + &sub($a0,$b0,$a0); # do the subtract + &ld($a2,&QWPw(2,$ap)); + &cmpult($a0,$cc,$b0); # will we borrow? + &ld($b2,&QWPw(2,$bp)); + &sub($a0,$cc,$a0); # will we borrow? + &ld($a3,&QWPw(3,$ap)); + &add($b0,$tmp,$cc); # add the borrows + + &cmpult($a1,$b1,$t1); # will we borrow? + &sub($a1,$b1,$a1); # do the subtract + &ld($b3,&QWPw(3,$bp)); + &cmpult($a1,$cc,$b1); # will we borrow? + &sub($a1,$cc,$a1); # will we borrow? + &add($b1,$t1,$cc); # add the borrows + + &cmpult($a2,$b2,$tmp); # will we borrow? + &sub($a2,$b2,$a2); # do the subtract + &st($a0,&QWPw(0,$rp)); # save + &cmpult($a2,$cc,$b2); # will we borrow? + &sub($a2,$cc,$a2); # will we borrow? + &add($b2,$tmp,$cc); # add the borrows + + &cmpult($a3,$b3,$t3); # will we borrow? + &sub($a3,$b3,$a3); # do the subtract + &st($a1,&QWPw(1,$rp)); # save + &cmpult($a3,$cc,$b3); # will we borrow? 
+ &sub($a3,$cc,$a3); # subtract the borrow + &add($b3,$t3,$cc); # add the borrows + + &st($a2,&QWPw(2,$rp)); # save + &sub($count,4,$count); # count-=4 + &st($a3,&QWPw(3,$rp)); # save + &add($ap,4*$QWS,$ap); # ap+=4 + &add($bp,4*$QWS,$bp); # bp+=4 + &add($rp,4*$QWS,$rp); # rp+=4 + + &blt($count,&label("finish")); + &ld($a0,&QWPw(0,$ap)); + &ld($b0,&QWPw(0,$bp)); + &br(&label("loop")); +################################################## + # Do the last 0..3 words + + &set_label("last_loop"); + + &ld($a0,&QWPw(0,$ap)); # get a + &ld($b0,&QWPw(0,$bp)); # get b + &cmpult($a0,$b0,$tmp); # will we borrow? + &sub($a0,$b0,$a0); # do the subtract + &cmpult($a0,$cc,$b0); # will we borrow? + &sub($a0,$cc,$a0); # subtract the borrow + &st($a0,&QWPw(0,$rp)); # save + &add($b0,$tmp,$cc); # add the borrows + + &add($ap,$QWS,$ap); + &add($bp,$QWS,$bp); + &add($rp,$QWS,$rp); + &sub($count,1,$count); + &bgt($count,&label("last_loop")); + &function_end_A($name); + +###################################################### + &set_label("finish"); + &add($count,4,$count); + &bgt($count,&label("last_loop")); + + &set_label("end"); + &function_end($name); + } + diff --git a/crypto/bn/asm/co86unix.cpp b/crypto/bn/asm/co86unix.cpp new file mode 100644 index 0000000000..fa80b14046 --- /dev/null +++ b/crypto/bn/asm/co86unix.cpp @@ -0,0 +1,1315 @@ +/* Run the C pre-processor over this file with one of the following defined + * ELF - elf object files, + * OUT - a.out object files, + * BSDI - BSDI style a.out object files + * SOL - Solaris style elf + */ + +#define TYPE(a,b) .type a,b +#define SIZE(a,b) .size a,b + +#if defined(OUT) || defined(BSDI) +#define bn_mul_comba8 _bn_mul_comba8 +#define bn_mul_comba4 _bn_mul_comba4 +#define bn_sqr_comba8 _bn_sqr_comba8 +#define bn_sqr_comba4 _bn_sqr_comba4 + +#endif + +#ifdef OUT +#define OK 1 +#define ALIGN 4 +#endif + +#ifdef BSDI +#define OK 1 +#define ALIGN 4 +#undef SIZE +#undef TYPE +#define SIZE(a,b) +#define TYPE(a,b) +#endif + +#if defined(ELF) || defined(SOL) +#define OK 1 +#define ALIGN 16 +#endif + +#ifndef OK +You need to define one of +ELF - elf systems - linux-elf, NetBSD and DG-UX +OUT - a.out systems - linux-a.out and FreeBSD +SOL - solaris systems, which are elf with strange comment lines +BSDI - a.out with a very primitive version of as. 
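+For example (the exact command is platform dependent) one might run something like: + cc -E -DELF co86unix.cpp | as -o co86unix.o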
+#endif + +/* Let the Assembler begin :-) */ + /* Don't even think of reading this code */ + /* It was automatically generated by bn-586.pl */ + /* Which is a perl program used to generate the x86 assembler for */ + /* any of elf, a.out, BSDI, Win32, or Solaris */ + /* eric <eay@cryptsoft.com> */ + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align ALIGN +.globl bn_mul_comba8 + TYPE(bn_mul_comba8,@function) +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[4]*b[0] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[5]*b[0] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[1] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + 
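/* note how the movl loads for the next product are slotted into the */ + /* addl/adcl carry chain - movl does not touch the flags, so the */ + /* following adcl still sees the carry while operands are fetched */ +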
adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[4] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[6]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[1] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[2] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[4] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[5] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + /* saved r[6] */ + /* ################## Calculate word 7 */ + xorl %ebx, %ebx + /* mul a[7]*b[0] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[2] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[3] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[4] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[5] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[6] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + /* saved r[7] */ + /* ################## Calculate word 8 */ + xorl %ecx, %ecx + /* mul a[7]*b[1] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[3] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[4] */ + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[3]*b[5] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[6] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[7] */ + mull 
%edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + /* saved r[8] */ + /* ################## Calculate word 9 */ + xorl %ebp, %ebp + /* mul a[7]*b[2] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[4] */ + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + /* mul a[4]*b[5] */ + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[3]*b[6] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + /* saved r[9] */ + /* ################## Calculate word 10 */ + xorl %ebx, %ebx + /* mul a[7]*b[3] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[4] */ + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + /* mul a[5]*b[5] */ + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + /* mul a[4]*b[6] */ + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[3]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + /* saved r[10] */ + /* ################## Calculate word 11 */ + xorl %ecx, %ecx + /* mul a[7]*b[4] */ + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + /* mul a[6]*b[5] */ + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + /* mul a[5]*b[6] */ + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + /* mul a[4]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + /* saved r[11] */ + /* ################## Calculate word 12 */ + xorl %ebp, %ebp + /* mul a[7]*b[5] */ + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + /* mul a[6]*b[6] */ + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + /* mul a[5]*b[7] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + /* saved r[12] */ + /* ################## Calculate word 13 */ + xorl %ebx, %ebx + /* mul a[7]*b[6] */ + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + /* mul a[6]*b[7] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + /* saved r[13] */ + /* ################## Calculate word 14 */ + xorl %ecx, %ecx + /* mul a[7]*b[7] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + 
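/* the adcl below folds the carry out of the high half into ecx, the */ + /* third accumulator of the rotating (c0,c1,c2) column-sum pipeline */ +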
adcl $0, %ecx + movl %ebp, 56(%eax) + /* saved r[14] */ + /* save r[15] */ + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_mul_comba4 + TYPE(bn_mul_comba4,@function) +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + /* ################## Calculate word 0 */ + xorl %ebp, %ebp + /* mul a[0]*b[0] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + /* saved r[0] */ + /* ################## Calculate word 1 */ + xorl %ebx, %ebx + /* mul a[1]*b[0] */ + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + /* mul a[0]*b[1] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + /* saved r[1] */ + /* ################## Calculate word 2 */ + xorl %ecx, %ecx + /* mul a[2]*b[0] */ + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + /* mul a[1]*b[1] */ + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + /* mul a[0]*b[2] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + /* saved r[2] */ + /* ################## Calculate word 3 */ + xorl %ebp, %ebp + /* mul a[3]*b[0] */ + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + /* mul a[2]*b[1] */ + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + /* mul a[1]*b[2] */ + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + /* mul a[0]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + /* saved r[3] */ + /* ################## Calculate word 4 */ + xorl %ebx, %ebx + /* mul a[3]*b[1] */ + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + /* mul a[2]*b[2] */ + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + /* mul a[1]*b[3] */ + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + /* saved r[4] */ + /* ################## Calculate word 5 */ + xorl %ecx, %ecx + /* mul a[3]*b[2] */ + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + /* mul a[2]*b[3] */ + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + /* saved r[5] */ + /* ################## Calculate word 6 */ + xorl %ebp, %ebp + /* mul a[3]*b[3] */ + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + /* saved r[6] */ + /* save r[7] */ + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + 
SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba8 + TYPE(bn_sqr_comba8,@function) +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[4]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[5]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + /* sqr a[4]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[6]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[5]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + /* sqr a[4]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl 
$0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + /* saved r[6] */ + /* ############### Calculate word 7 */ + xorl %ebx, %ebx + /* sqr a[7]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + /* sqr a[6]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + /* sqr a[5]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + /* sqr a[4]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + /* saved r[7] */ + /* ############### Calculate word 8 */ + xorl %ecx, %ecx + /* sqr a[7]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + /* sqr a[6]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + /* sqr a[5]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + /* sqr a[4]*a[4] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + /* saved r[8] */ + /* ############### Calculate word 9 */ + xorl %ebp, %ebp + /* sqr a[7]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + /* sqr a[6]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + /* sqr a[5]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + /* saved r[9] */ + /* ############### Calculate word 10 */ + xorl %ebx, %ebx + /* sqr a[7]*a[3] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + /* sqr a[6]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + /* sqr a[5]*a[5] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + /* saved r[10] */ + /* ############### Calculate word 11 */ + xorl %ecx, %ecx + /* sqr a[7]*a[4] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + /* sqr a[6]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + /* saved r[11] */ + /* ############### Calculate word 12 */ + xorl %ebp, %ebp + /* sqr a[7]*a[5] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp 
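+ /* diagonal product: a[6]*a[6] is added in just once - only the cross */ + /* products a[i]*a[j] with i != j get the doubling treatment above */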
+ /* sqr a[6]*a[6] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + /* saved r[12] */ + /* ############### Calculate word 13 */ + xorl %ebx, %ebx + /* sqr a[7]*a[6] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + /* saved r[13] */ + /* ############### Calculate word 14 */ + xorl %ecx, %ecx + /* sqr a[7]*a[7] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + /* saved r[14] */ + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8) +.ident "desasm.pl" +.text + .align ALIGN +.globl bn_sqr_comba4 + TYPE(bn_sqr_comba4,@function) +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + /* ############### Calculate word 0 */ + xorl %ebp, %ebp + /* sqr a[0]*a[0] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + /* saved r[0] */ + /* ############### Calculate word 1 */ + xorl %ebx, %ebx + /* sqr a[1]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + /* saved r[1] */ + /* ############### Calculate word 2 */ + xorl %ecx, %ecx + /* sqr a[2]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + /* sqr a[1]*a[1] */ + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + /* saved r[2] */ + /* ############### Calculate word 3 */ + xorl %ebp, %ebp + /* sqr a[3]*a[0] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + /* sqr a[2]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + /* saved r[3] */ + /* ############### Calculate word 4 */ + xorl %ebx, %ebx + /* sqr a[3]*a[1] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + /* sqr a[2]*a[2] */ + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + /* saved r[4] */ + /* ############### Calculate word 5 */ + xorl %ecx, %ecx + /* sqr a[3]*a[2] */ + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + /* saved r[5] */ + /* ############### Calculate word 6 */ + xorl %ebp, %ebp + /* sqr a[3]*a[3] */ + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + /* saved r[6] */ + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4) +.ident "desasm.pl" diff --git a/crypto/bn/asm/elf.s b/crypto/bn/asm/elf.s new file mode 100644 index 0000000000..97ad1264db --- /dev/null +++ b/crypto/bn/asm/elf.s @@ -0,0 +1,1269 
@@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl + # Which is a perl program used to generate the x86 assembler for + # any of elf, a.out, BSDI, Win32, or Solaris + # eric <eay@cryptsoft.com> + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_comba8 + .type bn_mul_comba8,@function +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[4]*b[0] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[5]*b[0] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[1] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # 
mul a[1]*b[4] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[6]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[1] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[2] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[4] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[5] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + # saved r[6] + # ################## Calculate word 7 + xorl %ebx, %ebx + # mul a[7]*b[0] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[2] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[3] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[4] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[5] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[6] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx, %ecx + # mul a[7]*b[1] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[3] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[4] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[5] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[6] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl %ebp, 32(%eax) + movl 28(%esi), %eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp, %ebp + # mul a[7]*b[2] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + 
movl 12(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[4] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[5] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[6] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx, %ebx + # mul a[7]*b[3] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[5] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[6] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx, %ecx + # mul a[7]*b[4] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[6] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp, %ebp + # mul a[7]*b[5] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx, %ebx + # mul a[7]*b[6] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx, %ecx + # mul a[7]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + # saved r[14] + # save r[15] + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_mul_comba4 + .type bn_mul_comba4,@function +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx 
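+	# bn_mul_comba4 below is the 4x4-word version of the same scheme:
+	# the same three registers rotate through the column sums and the
+	# 8-word product fills r[0..7]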
+ movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + # saved r[6] + # save r[7] + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba8 + .type bn_sqr_comba8,@function +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, 
%eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[4]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[5]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + # sqr a[4]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[6]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[5]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + # sqr a[4]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx, %ebx + # sqr a[7]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[6]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + # sqr a[5]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + # sqr a[4]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl 
%eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx, %ecx + # sqr a[7]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[6]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + # sqr a[5]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + # sqr a[4]*a[4] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp, %ebp + # sqr a[7]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + # sqr a[6]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + # sqr a[5]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx, %ebx + # sqr a[7]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + # sqr a[6]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + # sqr a[5]*a[5] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx, %ecx + # sqr a[7]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + # sqr a[6]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp, %ebp + # sqr a[7]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + # sqr a[6]*a[6] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx, %ebx + # sqr a[7]*a[6] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx, %ecx + # sqr a[7]*a[7] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + # saved r[14] + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba8_end: + .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba4 + .type bn_sqr_comba4,@function +bn_sqr_comba4: + pushl 
%esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + # saved r[6] + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4 +.ident "desasm.pl" diff --git a/crypto/bn/asm/f b/crypto/bn/asm/f new file mode 100644 index 0000000000..a23fa159b2 --- /dev/null +++ b/crypto/bn/asm/f @@ -0,0 +1,500 @@ + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, $24 + cmplt $28, $31, $25 + cmplt $24, $31, $21 + addq $28, $28, $28 + addq $24, $24, $24 + addq 
$24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, $24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, $21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 + cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt $18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 + cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, 
$22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq $8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, $20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, $23 + addq $8, $17, $8 + stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, $31, $23 + mulq $5, $4, $28 + umulh $5, $4, $19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, $22 + addq $23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, 
$19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq $22, $21, $22 + mulq $7, $4, $28 + umulh $7, $4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/f.c b/crypto/bn/asm/f.c new file mode 100644 index 0000000000..bfdccae4a0 --- /dev/null +++ b/crypto/bn/asm/f.c @@ -0,0 +1,8 @@ +int abc(a,b,c,d,e,f,g,h,i,j) +unsigned long a,b,c,d,e,f,g,h,i,j; + { + gg(g); + if (g) + gg(h); + gg(i); + } diff --git a/crypto/bn/asm/f.elf b/crypto/bn/asm/f.elf new file mode 100644 index 0000000000..39d07b79e1 --- /dev/null +++ b/crypto/bn/asm/f.elf @@ -0,0 +1,2149 @@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl + # Which is a perl program used to generate the x86 assember for + # any of elf, a.out, BSDI,Win32, or Solaris + # eric <eay@cryptsoft.com> + + .file "bn-586.s" + .version "01.01" +gcc2_compiled.: +.text + .align 16 +.globl bn_mul_add_words + .type bn_mul_add_words,@function +bn_mul_add_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + xorl %esi, %esi + movl 20(%esp), %edi + movl 28(%esp), %ecx + movl 24(%esp), %ebx + andl $4294967288, %ecx + movl 32(%esp), %ebp + pushl %ecx + jz .L000maw_finish +.L001maw_loop: + movl %ecx, (%esp) + # Round 0 + movl (%ebx), %eax + mull %ebp + addl %esi, %eax + movl (%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + # Round 4 + movl 4(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 4(%edi), %esi + adcl $0, %edx + addl 
%esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + # Round 8 + movl 8(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 8(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + # Round 12 + movl 12(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 12(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + # Round 16 + movl 16(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 16(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + # Round 20 + movl 20(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 20(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + # Round 24 + movl 24(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 24(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi + # Round 28 + movl 28(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 28(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 28(%edi) + movl %edx, %esi + + movl (%esp), %ecx + addl $32, %ebx + addl $32, %edi + subl $8, %ecx + jnz .L001maw_loop +.L000maw_finish: + movl 32(%esp), %ecx + andl $7, %ecx + jnz .L002maw_finish2 + jmp .L003maw_end +.align 16 +.L002maw_finish2: + # Tail Round 0 + movl (%ebx), %eax + mull %ebp + addl %esi, %eax + movl (%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, (%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 1 + movl 4(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 4(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 4(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 2 + movl 8(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 8(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 8(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 3 + movl 12(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 12(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 12(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 4 + movl 16(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 16(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 16(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 5 + movl 20(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 20(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + decl %ecx + movl %eax, 20(%edi) + movl %edx, %esi + jz .L003maw_end + # Tail Round 6 + movl 24(%ebx), %eax + mull %ebp + addl %esi, %eax + movl 24(%edi), %esi + adcl $0, %edx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi +.L003maw_end: + movl %esi, %eax + popl %ecx + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_mul_add_words_end: + .size bn_mul_add_words,.bn_mul_add_words_end-bn_mul_add_words +.ident "bn_mul_add_words" +.text + .align 16 +.globl bn_mul_words + .type bn_mul_words,@function +bn_mul_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + xorl %esi, %esi + movl 20(%esp), %edi + movl 24(%esp), %ebx + movl 28(%esp), %ebp + movl 32(%esp), %ecx + andl $4294967288, %ebp + jz .L004mw_finish +.L005mw_loop: + # Round 0 + movl (%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + # Round 4 + movl 4(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + # Round 8 + movl 
8(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + # Round 12 + movl 12(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + # Round 16 + movl 16(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + # Round 20 + movl 20(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + # Round 24 + movl 24(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi + # Round 28 + movl 28(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 28(%edi) + movl %edx, %esi + + addl $32, %ebx + addl $32, %edi + subl $8, %ebp + jz .L004mw_finish + jmp .L005mw_loop +.L004mw_finish: + movl 28(%esp), %ebp + andl $7, %ebp + jnz .L006mw_finish2 + jmp .L007mw_end +.align 16 +.L006mw_finish2: + # Tail Round 0 + movl (%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, (%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 1 + movl 4(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 4(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 2 + movl 8(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 8(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 3 + movl 12(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 12(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 4 + movl 16(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 16(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 5 + movl 20(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 20(%edi) + movl %edx, %esi + decl %ebp + jz .L007mw_end + # Tail Round 6 + movl 24(%ebx), %eax + mull %ecx + addl %esi, %eax + adcl $0, %edx + movl %eax, 24(%edi) + movl %edx, %esi +.L007mw_end: + movl %esi, %eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_mul_words_end: + .size bn_mul_words,.bn_mul_words_end-bn_mul_words +.ident "bn_mul_words" +.text + .align 16 +.globl bn_sqr_words + .type bn_sqr_words,@function +bn_sqr_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %esi + movl 24(%esp), %edi + movl 28(%esp), %ebx + andl $4294967288, %ebx + jz .L008sw_finish +.L009sw_loop: + # Round 0 + movl (%edi), %eax + mull %eax + movl %eax, (%esi) + movl %edx, 4(%esi) + # Round 4 + movl 4(%edi), %eax + mull %eax + movl %eax, 8(%esi) + movl %edx, 12(%esi) + # Round 8 + movl 8(%edi), %eax + mull %eax + movl %eax, 16(%esi) + movl %edx, 20(%esi) + # Round 12 + movl 12(%edi), %eax + mull %eax + movl %eax, 24(%esi) + movl %edx, 28(%esi) + # Round 16 + movl 16(%edi), %eax + mull %eax + movl %eax, 32(%esi) + movl %edx, 36(%esi) + # Round 20 + movl 20(%edi), %eax + mull %eax + movl %eax, 40(%esi) + movl %edx, 44(%esi) + # Round 24 + movl 24(%edi), %eax + mull %eax + movl %eax, 48(%esi) + movl %edx, 52(%esi) + # Round 28 + movl 28(%edi), %eax + mull %eax + movl %eax, 56(%esi) + movl %edx, 60(%esi) + + addl $32, %edi + addl $64, %esi + subl $8, %ebx + jnz .L009sw_loop +.L008sw_finish: + movl 28(%esp), %ebx + andl $7, %ebx + jz .L010sw_end + # Tail Round 0 + movl (%edi), %eax + mull %eax + movl %eax, (%esi) + decl %ebx + movl %edx, 4(%esi) + jz .L010sw_end + # Tail Round 1 + movl 4(%edi), %eax + mull %eax + movl %eax, 8(%esi) + decl %ebx + movl %edx, 12(%esi) + jz .L010sw_end + # Tail Round 2 + movl 8(%edi), %eax + mull 
%eax + movl %eax, 16(%esi) + decl %ebx + movl %edx, 20(%esi) + jz .L010sw_end + # Tail Round 3 + movl 12(%edi), %eax + mull %eax + movl %eax, 24(%esi) + decl %ebx + movl %edx, 28(%esi) + jz .L010sw_end + # Tail Round 4 + movl 16(%edi), %eax + mull %eax + movl %eax, 32(%esi) + decl %ebx + movl %edx, 36(%esi) + jz .L010sw_end + # Tail Round 5 + movl 20(%edi), %eax + mull %eax + movl %eax, 40(%esi) + decl %ebx + movl %edx, 44(%esi) + jz .L010sw_end + # Tail Round 6 + movl 24(%edi), %eax + mull %eax + movl %eax, 48(%esi) + movl %edx, 52(%esi) +.L010sw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sqr_words_end: + .size bn_sqr_words,.bn_sqr_words_end-bn_sqr_words +.ident "bn_sqr_words" +.text + .align 16 +.globl bn_div64 + .type bn_div64,@function +bn_div64: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp), %edx + movl 24(%esp), %eax + movl 28(%esp), %ebx + divl %ebx + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_div64_end: + .size bn_div64,.bn_div64_end-bn_div64 +.ident "bn_div64" +.text + .align 16 +.globl bn_add_words + .type bn_add_words,@function +bn_add_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L011aw_finish +.L012aw_loop: + # Round 0 + movl (%esi), %ecx + movl (%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + # Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + # Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + # Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + # Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 16(%ebx) + # Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 20(%ebx) + # Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + # Round 7 + movl 28(%esi), %ecx + movl 28(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L012aw_loop +.L011aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L013aw_end + # Tail Round 0 + movl (%esi), %ecx + movl (%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L013aw_end + # Tail Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L013aw_end + # Tail Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L013aw_end + # Tail Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + 
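+	# movl leaves the flags untouched, so the jz after each tail-round
+	# store still acts on the decl above it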
movl %ecx, 12(%ebx) + jz .L013aw_end + # Tail Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L013aw_end + # Tail Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L013aw_end + # Tail Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + addl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + addl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) +.L013aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_add_words_end: + .size bn_add_words,.bn_add_words_end-bn_add_words +.ident "bn_add_words" +.text + .align 16 +.globl bn_sub_words + .type bn_sub_words,@function +bn_sub_words: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + + movl 20(%esp), %ebx + movl 24(%esp), %esi + movl 28(%esp), %edi + movl 32(%esp), %ebp + xorl %eax, %eax + andl $4294967288, %ebp + jz .L014aw_finish +.L015aw_loop: + # Round 0 + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, (%ebx) + # Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 4(%ebx) + # Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 8(%ebx) + # Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 12(%ebx) + # Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 16(%ebx) + # Round 5 + movl 20(%esi), %ecx + movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 20(%ebx) + # Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) + # Round 7 + movl 28(%esi), %ecx + movl 28(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 28(%ebx) + + addl $32, %esi + addl $32, %edi + addl $32, %ebx + subl $8, %ebp + jnz .L015aw_loop +.L014aw_finish: + movl 32(%esp), %ebp + andl $7, %ebp + jz .L016aw_end + # Tail Round 0 + movl (%esi), %ecx + movl (%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, (%ebx) + jz .L016aw_end + # Tail Round 1 + movl 4(%esi), %ecx + movl 4(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 4(%ebx) + jz .L016aw_end + # Tail Round 2 + movl 8(%esi), %ecx + movl 8(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 8(%ebx) + jz .L016aw_end + # Tail Round 3 + movl 12(%esi), %ecx + movl 12(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 12(%ebx) + jz .L016aw_end + # Tail Round 4 + movl 16(%esi), %ecx + movl 16(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 16(%ebx) + jz .L016aw_end + # Tail Round 5 + movl 20(%esi), %ecx + 
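+	# same structure as bn_add_words, but with subl: the movl $0, %eax /
+	# adcl %eax, %eax pair captures the borrow for the next round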
movl 20(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + decl %ebp + movl %ecx, 20(%ebx) + jz .L016aw_end + # Tail Round 6 + movl 24(%esi), %ecx + movl 24(%edi), %edx + subl %eax, %ecx + movl $0, %eax + adcl %eax, %eax + subl %edx, %ecx + adcl $0, %eax + movl %ecx, 24(%ebx) +.L016aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.bn_sub_words_end: + .size bn_sub_words,.bn_sub_words_end-bn_sub_words +.ident "bn_sub_words" +.text + .align 16 +.globl bn_mul_comba8 + .type bn_mul_comba8,@function +bn_mul_comba8: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 16(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[4]*b[0] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 20(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[5]*b[0] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[1] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 
8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[4] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 24(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[6]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[1] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[2] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[4] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[5] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, 24(%eax) + movl 28(%esi), %eax + # saved r[6] + # ################## Calculate word 7 + xorl %ebx, %ebx + # mul a[7]*b[0] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[2] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[3] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[4] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[5] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[6] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + movl %ecx, 28(%eax) + movl 28(%esi), %eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx, %ecx + # mul a[7]*b[1] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[3] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 16(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[4] + mull %edx + addl %eax, %ebp + movl 12(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[3]*b[5] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[6] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + movl 
%ebp, 32(%eax) + movl 28(%esi), %eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp, %ebp + # mul a[7]*b[2] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 16(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[4] + mull %edx + addl %eax, %ebx + movl 16(%esi), %eax + adcl %edx, %ecx + movl 20(%edi), %edx + adcl $0, %ebp + # mul a[4]*b[5] + mull %edx + addl %eax, %ebx + movl 12(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[3]*b[6] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + movl %ebx, 36(%eax) + movl 28(%esi), %eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx, %ebx + # mul a[7]*b[3] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[4] + mull %edx + addl %eax, %ecx + movl 20(%esi), %eax + adcl %edx, %ebp + movl 20(%edi), %edx + adcl $0, %ebx + # mul a[5]*b[5] + mull %edx + addl %eax, %ecx + movl 16(%esi), %eax + adcl %edx, %ebp + movl 24(%edi), %edx + adcl $0, %ebx + # mul a[4]*b[6] + mull %edx + addl %eax, %ecx + movl 12(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[3]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 16(%edi), %edx + adcl $0, %ebx + movl %ecx, 40(%eax) + movl 28(%esi), %eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx, %ecx + # mul a[7]*b[4] + mull %edx + addl %eax, %ebp + movl 24(%esi), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + # mul a[6]*b[5] + mull %edx + addl %eax, %ebp + movl 20(%esi), %eax + adcl %edx, %ebx + movl 24(%edi), %edx + adcl $0, %ecx + # mul a[5]*b[6] + mull %edx + addl %eax, %ebp + movl 16(%esi), %eax + adcl %edx, %ebx + movl 28(%edi), %edx + adcl $0, %ecx + # mul a[4]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 20(%edi), %edx + adcl $0, %ecx + movl %ebp, 44(%eax) + movl 28(%esi), %eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp, %ebp + # mul a[7]*b[5] + mull %edx + addl %eax, %ebx + movl 24(%esi), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + # mul a[6]*b[6] + mull %edx + addl %eax, %ebx + movl 20(%esi), %eax + adcl %edx, %ecx + movl 28(%edi), %edx + adcl $0, %ebp + # mul a[5]*b[7] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 24(%edi), %edx + adcl $0, %ebp + movl %ebx, 48(%eax) + movl 28(%esi), %eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx, %ebx + # mul a[7]*b[6] + mull %edx + addl %eax, %ecx + movl 24(%esi), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + # mul a[6]*b[7] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 28(%edi), %edx + adcl $0, %ebx + movl %ecx, 52(%eax) + movl 28(%esi), %eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx, %ecx + # mul a[7]*b[7] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%eax) + # saved r[14] + # save r[15] + movl %ebx, 60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba8_end: + .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8 +.ident "desasm.pl" 
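The eight "Calculate word N" blocks above all instantiate one pattern: a three-word column accumulator rotates through %ebx/%ecx/%ebp, and every 32x32->64 mull result is folded in with an addl/adcl/adcl $0 chain. A minimal C sketch of that step, assuming 32-bit words and a 64-bit intermediate type (all names here are illustrative, not taken from this diff):

    typedef unsigned int u32;            /* assumed 32-bit word */
    typedef unsigned long long u64;      /* assumed 64-bit intermediate */

    /* Add a*b into the three-word accumulator c2:c1:c0, mirroring the
     * mull / addl / adcl / adcl $0 group used for each product above. */
    static void mul_add_c(u32 a, u32 b, u32 *c0, u32 *c1, u32 *c2)
    {
        u64 t  = (u64)a * b;
        u32 lo = (u32)t, hi = (u32)(t >> 32);

        *c0 += lo;
        hi  += (*c0 < lo);  /* carry out of the low word; hi <= 0xfffffffe, so it cannot wrap */
        *c1 += hi;
        *c2 += (*c1 < hi);  /* carry out of the middle word */
    }

    /* One column of the 8x8 Comba multiply, e.g. word 4 = sum of a[i]*b[4-i].
     * Storing the finished column and rotating the accumulator down one word
     * corresponds to the "saved r[N]" points in the assembler above. */
    static void comba_word4(const u32 a[8], const u32 b[8], u32 r[16],
                            u32 *c0, u32 *c1, u32 *c2)
    {
        int i;
        for (i = 4; i >= 0; i--)
            mul_add_c(a[i], b[4 - i], c0, c1, c2);
        r[4] = *c0;
        *c0 = *c1; *c1 = *c2; *c2 = 0;
    }

    /* The bn_sqr_comba routines further down reuse the accumulator but
     * double each off-diagonal product (addl %eax,%eax / adcl %edx,%edx,
     * with an extra adcl $0 catching the shifted-out top bit), since
     * a[i]*a[j] with i != j appears twice in a square.  Accumulating the
     * product twice is the equivalent portable form: */
    static void sqr_add_c2(const u32 a[], int i, int j,
                           u32 *c0, u32 *c1, u32 *c2)
    {
        mul_add_c(a[i], a[j], c0, c1, c2);
        mul_add_c(a[i], a[j], c0, c1, c2);
    }

Three accumulator words are enough: even the widest column sums eight 64-bit products, which stays well below 2^96.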
+.text + .align 16 +.globl bn_mul_comba4 + .type bn_mul_comba4,@function +bn_mul_comba4: + pushl %esi + movl 12(%esp), %esi + pushl %edi + movl 20(%esp), %edi + pushl %ebp + pushl %ebx + xorl %ebx, %ebx + movl (%esi), %eax + xorl %ecx, %ecx + movl (%edi), %edx + # ################## Calculate word 0 + xorl %ebp, %ebp + # mul a[0]*b[0] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl (%edi), %edx + adcl $0, %ebp + movl %ebx, (%eax) + movl 4(%esi), %eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx, %ebx + # mul a[1]*b[0] + mull %edx + addl %eax, %ecx + movl (%esi), %eax + adcl %edx, %ebp + movl 4(%edi), %edx + adcl $0, %ebx + # mul a[0]*b[1] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl (%edi), %edx + adcl $0, %ebx + movl %ecx, 4(%eax) + movl 8(%esi), %eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx, %ecx + # mul a[2]*b[0] + mull %edx + addl %eax, %ebp + movl 4(%esi), %eax + adcl %edx, %ebx + movl 4(%edi), %edx + adcl $0, %ecx + # mul a[1]*b[1] + mull %edx + addl %eax, %ebp + movl (%esi), %eax + adcl %edx, %ebx + movl 8(%edi), %edx + adcl $0, %ecx + # mul a[0]*b[2] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl (%edi), %edx + adcl $0, %ecx + movl %ebp, 8(%eax) + movl 12(%esi), %eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp, %ebp + # mul a[3]*b[0] + mull %edx + addl %eax, %ebx + movl 8(%esi), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + # mul a[2]*b[1] + mull %edx + addl %eax, %ebx + movl 4(%esi), %eax + adcl %edx, %ecx + movl 8(%edi), %edx + adcl $0, %ebp + # mul a[1]*b[2] + mull %edx + addl %eax, %ebx + movl (%esi), %eax + adcl %edx, %ecx + movl 12(%edi), %edx + adcl $0, %ebp + # mul a[0]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + movl 4(%edi), %edx + adcl $0, %ebp + movl %ebx, 12(%eax) + movl 12(%esi), %eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx, %ebx + # mul a[3]*b[1] + mull %edx + addl %eax, %ecx + movl 8(%esi), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + # mul a[2]*b[2] + mull %edx + addl %eax, %ecx + movl 4(%esi), %eax + adcl %edx, %ebp + movl 12(%edi), %edx + adcl $0, %ebx + # mul a[1]*b[3] + mull %edx + addl %eax, %ecx + movl 20(%esp), %eax + adcl %edx, %ebp + movl 8(%edi), %edx + adcl $0, %ebx + movl %ecx, 16(%eax) + movl 12(%esi), %eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx, %ecx + # mul a[3]*b[2] + mull %edx + addl %eax, %ebp + movl 8(%esi), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + # mul a[2]*b[3] + mull %edx + addl %eax, %ebp + movl 20(%esp), %eax + adcl %edx, %ebx + movl 12(%edi), %edx + adcl $0, %ecx + movl %ebp, 20(%eax) + movl 12(%esi), %eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp, %ebp + # mul a[3]*b[3] + mull %edx + addl %eax, %ebx + movl 20(%esp), %eax + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%eax) + # saved r[6] + # save r[7] + movl %ecx, 28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_mul_comba4_end: + .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba8 + .type bn_sqr_comba8,@function +bn_sqr_comba8: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + 
adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl (%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[4]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 12(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl (%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 20(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[5]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + movl 4(%esi), %edx + # sqr a[4]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + movl (%esi), %edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[6]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[5]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 16(%esi), %eax + adcl $0, %ebp + movl 8(%esi), %edx + # sqr a[4]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, 24(%edi) + movl 28(%esi), %eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx, %ebx + # sqr a[7]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 4(%esi), %edx + # sqr a[6]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + movl 8(%esi), %edx + # sqr a[5]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, 
%edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %eax + adcl $0, %ebx + movl 12(%esi), %edx + # sqr a[4]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 28(%edi) + movl 4(%esi), %edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx, %ecx + # sqr a[7]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 8(%esi), %edx + # sqr a[6]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 20(%esi), %eax + adcl $0, %ecx + movl 12(%esi), %edx + # sqr a[5]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 16(%esi), %eax + adcl $0, %ecx + # sqr a[4]*a[4] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl 8(%esi), %edx + adcl $0, %ecx + movl %ebp, 32(%edi) + movl 28(%esi), %eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp, %ebp + # sqr a[7]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + movl 12(%esi), %edx + # sqr a[6]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 20(%esi), %eax + adcl $0, %ebp + movl 16(%esi), %edx + # sqr a[5]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 28(%esi), %eax + adcl $0, %ebp + movl %ebx, 36(%edi) + movl 12(%esi), %edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx, %ebx + # sqr a[7]*a[3] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 24(%esi), %eax + adcl $0, %ebx + movl 16(%esi), %edx + # sqr a[6]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 20(%esi), %eax + adcl $0, %ebx + # sqr a[5]*a[5] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 16(%esi), %edx + adcl $0, %ebx + movl %ecx, 40(%edi) + movl 28(%esi), %eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx, %ecx + # sqr a[7]*a[4] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 24(%esi), %eax + adcl $0, %ecx + movl 20(%esi), %edx + # sqr a[6]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 28(%esi), %eax + adcl $0, %ecx + movl %ebp, 44(%edi) + movl 20(%esi), %edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp, %ebp + # sqr a[7]*a[5] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %eax + adcl $0, %ebp + # sqr a[6]*a[6] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl 24(%esi), %edx + adcl $0, %ebp + movl %ebx, 48(%edi) + movl 28(%esi), %eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx, %ebx + # sqr a[7]*a[6] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 28(%esi), %eax + adcl $0, %ebx + movl %ecx, 52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx, %ecx + # sqr a[7]*a[7] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + adcl $0, %ecx + movl %ebp, 56(%edi) + # saved r[14] + movl %ebx, 60(%edi) + popl %ebx + popl %ebp + popl %edi + 
popl %esi + ret +.bn_sqr_comba8_end: + .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8 +.ident "desasm.pl" +.text + .align 16 +.globl bn_sqr_comba4 + .type bn_sqr_comba4,@function +bn_sqr_comba4: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp), %edi + movl 24(%esp), %esi + xorl %ebx, %ebx + xorl %ecx, %ecx + movl (%esi), %eax + # ############### Calculate word 0 + xorl %ebp, %ebp + # sqr a[0]*a[0] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + movl (%esi), %edx + adcl $0, %ebp + movl %ebx, (%edi) + movl 4(%esi), %eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx, %ebx + # sqr a[1]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + movl %ecx, 4(%edi) + movl (%esi), %edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx, %ecx + # sqr a[2]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 4(%esi), %eax + adcl $0, %ecx + # sqr a[1]*a[1] + mull %eax + addl %eax, %ebp + adcl %edx, %ebx + movl (%esi), %edx + adcl $0, %ecx + movl %ebp, 8(%edi) + movl 12(%esi), %eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp, %ebp + # sqr a[3]*a[0] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 8(%esi), %eax + adcl $0, %ebp + movl 4(%esi), %edx + # sqr a[2]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebp + addl %eax, %ebx + adcl %edx, %ecx + movl 12(%esi), %eax + adcl $0, %ebp + movl %ebx, 12(%edi) + movl 4(%esi), %edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx, %ebx + # sqr a[3]*a[1] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ebx + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %eax + adcl $0, %ebx + # sqr a[2]*a[2] + mull %eax + addl %eax, %ecx + adcl %edx, %ebp + movl 8(%esi), %edx + adcl $0, %ebx + movl %ecx, 16(%edi) + movl 12(%esi), %eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx, %ecx + # sqr a[3]*a[2] + mull %edx + addl %eax, %eax + adcl %edx, %edx + adcl $0, %ecx + addl %eax, %ebp + adcl %edx, %ebx + movl 12(%esi), %eax + adcl $0, %ecx + movl %ebp, 20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp, %ebp + # sqr a[3]*a[3] + mull %eax + addl %eax, %ebx + adcl %edx, %ecx + adcl $0, %ebp + movl %ebx, 24(%edi) + # saved r[6] + movl %ecx, 28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.bn_sqr_comba4_end: + .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4 +.ident "desasm.pl" diff --git a/crypto/bn/asm/f.s b/crypto/bn/asm/f.s new file mode 100644 index 0000000000..2f8f63c690 --- /dev/null +++ b/crypto/bn/asm/f.s @@ -0,0 +1,1773 @@ + # Don't even think of reading this code + # It was automatically generated by bn-586.pl, + # which is a perl program used to generate the alpha assembler.
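+ #
+ # For reference, the first routine below, bn_mul_words, computes
+ # r[i]=a[i]*w plus a running carry and returns the final carry.
+ # Roughly, in C -- just a sketch, using 32 bit words so plain C can
+ # hold the double-width product; the Alpha code uses 64 bit words,
+ # with mulq giving the low half and umulh the high half:
+ #
+ #	unsigned int bn_mul_words(unsigned int *r, unsigned int *a,
+ #		int num, unsigned int w)
+ #		{
+ #		unsigned int c=0;
+ #		while (num-- > 0)
+ #			{
+ #			unsigned long long t;
+ #			t=(unsigned long long)(*(a++))*w+c;
+ #			*(r++)=(unsigned int)t;		/* low half */
+ #			c=(unsigned int)(t>>32);	/* high half carries */
+ #			}
+ #		return(c);
+ #		}
+ #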
# eric <eay@cryptsoft.com> + + # DEC Alpha assembler + # Generated from perl scripts contained in SSLeay + .file 1 "bn-586.s" + .set noat + .text + .align 3 + .globl bn_mul_words + .ent bn_mul_words +bn_mul_words: +bn_mul_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $100 + blt $18, $100 + ldq $1, 0($17) + ldq $2, 0($16) +$101: + ldq $3, 0($17) + mulq $3, $19, $4 + addq $17, 8, $17 + umulh $3, $19, $5 + addq $4, $0, $4 + addq $16, 8, $16 + subq $18, 1, $18 + cmpult $4, $0, $0 + stq $4, -8($16) + addq $5, $0, $0 + bgt $18, $101 + ret $31,($26),1 +$100: + addq $18, 4, $18 + bgt $18, $101 +$102: + ret $31,($26),1 + .end bn_mul_words + .text + .align 3 + .globl bn_sqr_words + .ent bn_sqr_words +bn_sqr_words: +bn_sqr_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $103 + blt $18, $103 + ldq $1, 0($17) + ldq $2, 0($16) +$104: + ldq $3, 0($17) + mulq $3, $3, $4 + addq $17, 8, $17 + addq $16, 16, $16 + subq $18, 1, $18 + umulh $3, $3, $5 + stq $4, -16($16) + stq $5, -8($16) + bgt $18, $104 + ret $31,($26),1 +$103: + addq $18, 4, $18 + bgt $18, $104 +$105: + ret $31,($26),1 + .end bn_sqr_words + .text + .align 3 + .globl bn_mul_add_words + .ent bn_mul_add_words +bn_mul_add_words: +bn_mul_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $18, 4, $18 + bis $31, $31, $0 + br $106 + blt $18, $106 + ldq $1, 0($17) + ldq $2, 0($16) +$107: + ldq $3, 0($17) + ldq $4, 0($16) + mulq $3, $19, $5 + subq $18, 1, $18 + addq $17, 8, $17 + umulh $3, $19, $6 + addq $4, $5, $4 + addq $16, 8, $16 + cmpult $4, $5, $7 + addq $4, $0, $4 + addq $6, $7, $6 + cmpult $4, $0, $0 + stq $4, -8($16) + addq $6, $0, $0 + bgt $18, $107 + ret $31,($26),1 +$106: + addq $18, 4, $18 + bgt $18, $107 +$108: + ret $31,($26),1 + .end bn_mul_add_words + .text + .align 3 + .globl bn_add_words + .ent bn_add_words +bn_add_words: +bn_add_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + br $109 + blt $19, $109 + ldq $1, 0($17) + ldq $2, 0($18) +$110: + ldq $3, 8($17) + ldq $4, 8($18) + ldq $5, 16($17) + ldq $6, 16($18) + ldq $7, 24($17) + ldq $8, 24($18) + addq $1, $2, $22 + cmpult $22, $2, $23 + addq $22, $0, $22 + cmpult $22, $0, $0 + addq $0, $23, $0 + addq $3, $4, $25 + cmpult $25, $4, $24 + addq $25, $0, $25 + cmpult $25, $0, $0 + addq $0, $24, $0 + addq $5, $6, $28 + cmpult $28, $6, $27 + addq $28, $0, $28 + cmpult $28, $0, $0 + addq $0, $27, $0 + addq $7, $8, $20 + cmpult $20, $8, $21 + addq $20, $0, $20 + cmpult $20, $0, $0 + addq $0, $21, $0 + stq $22, 0($16) + stq $25, 8($16) + stq $28, 16($16) + stq $20, 24($16) + subq $19, 4, $19 + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $109 + ldq $1, 0($17) + ldq $2, 0($18) + br $110 +$111: + ldq $1, 0($17) + ldq $2, 0($18) + addq $1, $2, $3 + cmpult $3, $2, $23 + addq $3, $0, $3 + cmpult $3, $0, $0 + addq $0, $23, $0 + stq $3, 0($16) + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $111 + ret $31,($26),1 +$109: + addq $19, 4, $19 + bgt $19, $111 +$112: + ret $31,($26),1 + .end bn_add_words + .text + .align 3 + .globl bn_sub_words + .ent bn_sub_words +bn_sub_words: +bn_sub_words..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $19, 4, $19 + bis $31, $31, $0 + blt $19, $113 + ldq $1, 0($17) + ldq $2, 0($18) +$114: + ldq $3, 8($17) + cmpult $1, $2, $4 + ldq $5, 8($18) + subq $1, $2, $1 + ldq $6, 16($17) + cmpult $1, $0, $2 + ldq $7, 16($18) + subq $1, $0, $23 + ldq $8, 24($17) + addq $2, $4, $0 + 
cmpult $3, $5, $24 + subq $3, $5, $3 + ldq $22, 24($18) + cmpult $3, $0, $5 + subq $3, $0, $25 + addq $5, $24, $0 + cmpult $6, $7, $27 + subq $6, $7, $6 + stq $23, 0($16) + cmpult $6, $0, $7 + subq $6, $0, $28 + addq $7, $27, $0 + cmpult $8, $22, $21 + subq $8, $22, $8 + stq $25, 8($16) + cmpult $8, $0, $22 + subq $8, $0, $20 + addq $22, $21, $0 + stq $28, 16($16) + subq $19, 4, $19 + stq $20, 24($16) + addq $17, 32, $17 + addq $18, 32, $18 + addq $16, 32, $16 + blt $19, $113 + ldq $1, 0($17) + ldq $2, 0($18) + br $114 +$115: + ldq $1, 0($17) + ldq $2, 0($18) + cmpult $1, $2, $27 + subq $1, $2, $1 + cmpult $1, $0, $2 + subq $1, $0, $1 + stq $1, 0($16) + addq $2, $27, $0 + addq $17, 8, $17 + addq $18, 8, $18 + addq $16, 8, $16 + subq $19, 1, $19 + bgt $19, $115 + ret $31,($26),1 +$113: + addq $19, 4, $19 + bgt $19, $115 +$116: + ret $31,($26),1 + .end bn_sub_words + # + # What follows was taken directly from the C compiler with a few + # hacks to redo the labels. + # +.text + .align 3 + .globl bn_div64 + .ent bn_div64 +bn_div64: + ldgp $29,0($27) +bn_div64..ng: + lda $30,-48($30) + .frame $30,48,$26,0 + stq $26,0($30) + stq $9,8($30) + stq $10,16($30) + stq $11,24($30) + stq $12,32($30) + stq $13,40($30) + .mask 0x4003e00,-48 + .prologue 1 + bis $16,$16,$9 + bis $17,$17,$10 + bis $18,$18,$11 + bis $31,$31,$13 + bis $31,2,$12 + bne $11,$9119 + lda $0,-1 + br $31,$9136 + .align 4 +$9119: + bis $11,$11,$16 + jsr $26,BN_num_bits_word + ldgp $29,0($26) + subq $0,64,$1 + beq $1,$9120 + bis $31,1,$1 + sll $1,$0,$1 + cmpule $9,$1,$1 + bne $1,$9120 + # lda $16,_IO_stderr_ + # lda $17,$C32 + # bis $0,$0,$18 + # jsr $26,fprintf + # ldgp $29,0($26) + jsr $26,abort + ldgp $29,0($26) + .align 4 +$9120: + bis $31,64,$3 + cmpult $9,$11,$2 + subq $3,$0,$1 + addl $1,$31,$0 + subq $9,$11,$1 + cmoveq $2,$1,$9 + beq $0,$9122 + zapnot $0,15,$2 + subq $3,$0,$1 + sll $11,$2,$11 + sll $9,$2,$3 + srl $10,$1,$1 + sll $10,$2,$10 + bis $3,$1,$9 +$9122: + srl $11,32,$5 + zapnot $11,15,$6 + lda $7,-1 + .align 5 +$9123: + srl $9,32,$1 + subq $1,$5,$1 + bne $1,$9126 + zapnot $7,15,$27 + br $31,$9127 + .align 4 +$9126: + bis $9,$9,$24 + bis $5,$5,$25 + divqu $24,$25,$27 +$9127: + srl $10,32,$4 + .align 5 +$9128: + mulq $27,$5,$1 + subq $9,$1,$3 + zapnot $3,240,$1 + bne $1,$9129 + mulq $6,$27,$2 + sll $3,32,$1 + addq $1,$4,$1 + cmpule $2,$1,$2 + bne $2,$9129 + subq $27,1,$27 + br $31,$9128 + .align 4 +$9129: + mulq $27,$6,$1 + mulq $27,$5,$4 + srl $1,32,$3 + sll $1,32,$1 + addq $4,$3,$4 + cmpult $10,$1,$2 + subq $10,$1,$10 + addq $2,$4,$2 + cmpult $9,$2,$1 + bis $2,$2,$4 + beq $1,$9134 + addq $9,$11,$9 + subq $27,1,$27 +$9134: + subl $12,1,$12 + subq $9,$4,$9 + beq $12,$9124 + sll $27,32,$13 + sll $9,32,$2 + srl $10,32,$1 + sll $10,32,$10 + bis $2,$1,$9 + br $31,$9123 + .align 4 +$9124: + bis $13,$27,$0 +$9136: + ldq $26,0($30) + ldq $9,8($30) + ldq $10,16($30) + ldq $11,24($30) + ldq $12,32($30) + ldq $13,40($30) + addq $30,48,$30 + ret $31,($26),1 + .end bn_div64 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + subq $30, 16, $30 + ldq $0, 0($17) + ldq $1, 0($18) + stq $9, 0($30) + stq $10, 8($30) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + ldq $8, 32($17) + ldq $22, 32($18) + ldq $23, 40($17) + ldq $24, 40($18) + ldq $25, 48($17) + ldq $27, 48($18) + ldq $28, 56($17) + ldq $21, 56($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $10
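+ # r[0]=a[0]*b[0] (low half) is stored above; the next block
+ # accumulates column 1 from a[0]*b[1] and a[1]*b[0]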
+ mulq $0, $3, $17 + umulh $0, $3, $18 + addq $19, $17, $19 + cmpult $19, $17, $20 + addq $20, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $17 + addq $10, $17, $10 + mulq $2, $1, $20 + umulh $2, $1, $18 + addq $19, $20, $19 + cmpult $19, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $20 + addq $10, $20, $10 + stq $19, 8($16) + bis $31, $31, $17 + mulq $0, $5, $18 + umulh $0, $5, $20 + addq $9, $18, $9 + cmpult $9, $18, $19 + addq $19, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $17, $18, $17 + mulq $2, $3, $19 + umulh $2, $3, $20 + addq $9, $19, $9 + cmpult $9, $19, $18 + addq $18, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $19 + addq $17, $19, $17 + mulq $4, $1, $18 + umulh $4, $1, $20 + addq $9, $18, $9 + cmpult $9, $18, $19 + addq $19, $20, $20 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $17, $18, $17 + stq $9, 16($16) + bis $31, $31, $19 + mulq $0, $7, $20 + umulh $0, $7, $18 + addq $10, $20, $10 + cmpult $10, $20, $9 + addq $9, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $20 + addq $19, $20, $19 + mulq $2, $5, $9 + umulh $2, $5, $18 + addq $10, $9, $10 + cmpult $10, $9, $20 + addq $20, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $9 + addq $19, $9, $19 + mulq $4, $3, $20 + umulh $4, $3, $18 + addq $10, $20, $10 + cmpult $10, $20, $9 + addq $9, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $20 + addq $19, $20, $19 + mulq $6, $1, $9 + umulh $6, $1, $18 + addq $10, $9, $10 + cmpult $10, $9, $20 + addq $20, $18, $18 + addq $17, $18, $17 + cmpult $17, $18, $9 + addq $19, $9, $19 + stq $10, 24($16) + bis $31, $31, $20 + mulq $0, $22, $18 + umulh $0, $22, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $9 + addq $17, $10, $17 + cmpult $17, $10, $18 + addq $18, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $9 + addq $17, $10, $17 + cmpult $17, $10, $18 + addq $18, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $9 + addq $17, $18, $17 + cmpult $17, $18, $10 + addq $10, $9, $9 + addq $19, $9, $19 + cmpult $19, $9, $18 + addq $20, $18, $20 + stq $17, 32($16) + bis $31, $31, $10 + mulq $0, $24, $9 + umulh $0, $24, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $2, $22, $17 + umulh $2, $22, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + mulq $4, $7, $9 + umulh $4, $7, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $6, $5, $17 + umulh $6, $5, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + mulq $8, $3, $9 + umulh $8, $3, $18 + addq $19, $9, $19 + cmpult $19, $9, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $9 + addq $10, $9, $10 + mulq $23, $1, $17 + umulh $23, $1, $18 + addq $19, $17, $19 + cmpult $19, $17, $9 + addq $9, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $10, $17, $10 + stq $19, 40($16) + bis $31, $31, $9 + mulq $0, $27, $18 + umulh $0, $27, $17 + addq 
$20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $2, $24, $19 + umulh $2, $24, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $4, $22, $18 + umulh $4, $22, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $6, $7, $19 + umulh $6, $7, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $8, $5, $18 + umulh $8, $5, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + mulq $23, $3, $19 + umulh $23, $3, $17 + addq $20, $19, $20 + cmpult $20, $19, $18 + addq $18, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $9, $19, $9 + mulq $25, $1, $18 + umulh $25, $1, $17 + addq $20, $18, $20 + cmpult $20, $18, $19 + addq $19, $17, $17 + addq $10, $17, $10 + cmpult $10, $17, $18 + addq $9, $18, $9 + stq $20, 48($16) + bis $31, $31, $19 + mulq $0, $21, $17 + umulh $0, $21, $18 + addq $10, $17, $10 + cmpult $10, $17, $20 + addq $20, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $17 + addq $19, $17, $19 + mulq $2, $27, $20 + umulh $2, $27, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $4, $24, $20 + umulh $4, $24, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $6, $22, $20 + umulh $6, $22, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $8, $7, $20 + umulh $8, $7, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $23, $5, $20 + umulh $23, $5, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + mulq $25, $3, $20 + umulh $25, $3, $17 + addq $10, $20, $10 + cmpult $10, $20, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $19, $0, $19 + mulq $28, $1, $20 + umulh $28, $1, $18 + addq $10, $20, $10 + cmpult $10, $20, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $19, $0, $19 + stq $10, 56($16) + bis $31, $31, $20 + mulq $2, $21, $17 + umulh $2, $21, $18 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $0, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $1 + addq $20, $1, $20 + mulq $4, $27, $10 + umulh $4, $27, $17 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $20, $18, $20 + mulq $6, $24, $1 + umulh $6, $24, $2 + addq $9, $1, $9 + cmpult $9, $1, $10 + addq $10, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $0 + addq $20, $0, $20 + mulq $8, $22, $17 + umulh $8, $22, $18 + addq $9, $17, $9 + cmpult $9, $17, $1 + addq $1, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $20, $10, $20 + mulq $23, $7, $2 + umulh $23, $7, $0 + addq $9, $2, $9 + cmpult $9, $2, $17 + addq $17, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $1 + addq $20, $1, $20 + mulq $25, $5, $18 + umulh $25, $5, $10 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $20, $17, $20 + mulq $28, $3, $0 + umulh $28, $3, $1 
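+ # as above: each mulq/umulh pair is the low/high half of one
+ # partial product, and the addq/cmpult chains fold it into the
+ # three-word column accumulator, propagating carries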
+ addq $9, $0, $9 + cmpult $9, $0, $18 + addq $18, $1, $1 + addq $19, $1, $19 + cmpult $19, $1, $2 + addq $20, $2, $20 + stq $9, 64($16) + bis $31, $31, $10 + mulq $4, $21, $17 + umulh $4, $21, $0 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $1 + addq $10, $1, $10 + mulq $6, $27, $2 + umulh $6, $27, $3 + addq $19, $2, $19 + cmpult $19, $2, $9 + addq $9, $3, $3 + addq $20, $3, $20 + cmpult $20, $3, $17 + addq $10, $17, $10 + mulq $8, $24, $18 + umulh $8, $24, $0 + addq $19, $18, $19 + cmpult $19, $18, $1 + addq $1, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $4 + addq $10, $4, $10 + mulq $23, $22, $2 + umulh $23, $22, $9 + addq $19, $2, $19 + cmpult $19, $2, $3 + addq $3, $9, $9 + addq $20, $9, $20 + cmpult $20, $9, $17 + addq $10, $17, $10 + mulq $25, $7, $18 + umulh $25, $7, $1 + addq $19, $18, $19 + cmpult $19, $18, $0 + addq $0, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $4 + addq $10, $4, $10 + mulq $28, $5, $2 + umulh $28, $5, $3 + addq $19, $2, $19 + cmpult $19, $2, $9 + addq $9, $3, $3 + addq $20, $3, $20 + cmpult $20, $3, $17 + addq $10, $17, $10 + stq $19, 72($16) + bis $31, $31, $18 + mulq $6, $21, $0 + umulh $6, $21, $1 + addq $20, $0, $20 + cmpult $20, $0, $4 + addq $4, $1, $1 + addq $10, $1, $10 + cmpult $10, $1, $2 + addq $18, $2, $18 + mulq $8, $27, $9 + umulh $8, $27, $3 + addq $20, $9, $20 + cmpult $20, $9, $17 + addq $17, $3, $3 + addq $10, $3, $10 + cmpult $10, $3, $5 + addq $18, $5, $18 + mulq $23, $24, $19 + umulh $23, $24, $0 + addq $20, $19, $20 + cmpult $20, $19, $4 + addq $4, $0, $0 + addq $10, $0, $10 + cmpult $10, $0, $1 + addq $18, $1, $18 + mulq $25, $22, $2 + umulh $25, $22, $6 + addq $20, $2, $20 + cmpult $20, $2, $9 + addq $9, $6, $6 + addq $10, $6, $10 + cmpult $10, $6, $17 + addq $18, $17, $18 + mulq $28, $7, $3 + umulh $28, $7, $5 + addq $20, $3, $20 + cmpult $20, $3, $19 + addq $19, $5, $5 + addq $10, $5, $10 + cmpult $10, $5, $4 + addq $18, $4, $18 + stq $20, 80($16) + bis $31, $31, $0 + mulq $8, $21, $1 + umulh $8, $21, $2 + addq $10, $1, $10 + cmpult $10, $1, $9 + addq $9, $2, $2 + addq $18, $2, $18 + cmpult $18, $2, $6 + addq $0, $6, $0 + mulq $23, $27, $17 + umulh $23, $27, $3 + addq $10, $17, $10 + cmpult $10, $17, $19 + addq $19, $3, $3 + addq $18, $3, $18 + cmpult $18, $3, $5 + addq $0, $5, $0 + mulq $25, $24, $4 + umulh $25, $24, $7 + addq $10, $4, $10 + cmpult $10, $4, $20 + addq $20, $7, $7 + addq $18, $7, $18 + cmpult $18, $7, $1 + addq $0, $1, $0 + mulq $28, $22, $9 + umulh $28, $22, $2 + addq $10, $9, $10 + cmpult $10, $9, $6 + addq $6, $2, $2 + addq $18, $2, $18 + cmpult $18, $2, $8 + addq $0, $8, $0 + stq $10, 88($16) + bis $31, $31, $17 + mulq $23, $21, $19 + umulh $23, $21, $3 + addq $18, $19, $18 + cmpult $18, $19, $5 + addq $5, $3, $3 + addq $0, $3, $0 + cmpult $0, $3, $4 + addq $17, $4, $17 + mulq $25, $27, $20 + umulh $25, $27, $7 + addq $18, $20, $18 + cmpult $18, $20, $1 + addq $1, $7, $7 + addq $0, $7, $0 + cmpult $0, $7, $9 + addq $17, $9, $17 + mulq $28, $24, $6 + umulh $28, $24, $2 + addq $18, $6, $18 + cmpult $18, $6, $8 + addq $8, $2, $2 + addq $0, $2, $0 + cmpult $0, $2, $22 + addq $17, $22, $17 + stq $18, 96($16) + bis $31, $31, $10 + mulq $25, $21, $19 + umulh $25, $21, $5 + addq $0, $19, $0 + cmpult $0, $19, $3 + addq $3, $5, $5 + addq $17, $5, $17 + cmpult $17, $5, $4 + addq $10, $4, $10 + mulq $28, $27, $23 + umulh $28, $27, $20 + addq $0, $23, $0 + cmpult $0, $23, $1 + addq $1, $20, $20 + addq $17, $20, $17 + cmpult $17, $20, $7 + addq $10, $7, $10 + stq $0, 
104($16) + bis $31, $31, $9 + mulq $28, $21, $6 + umulh $28, $21, $8 + addq $17, $6, $17 + cmpult $17, $6, $2 + addq $2, $8, $8 + addq $10, $8, $10 + cmpult $10, $8, $22 + addq $9, $22, $9 + stq $17, 112($16) + stq $10, 120($16) + ldq $9, 0($30) + ldq $10, 8($30) + addq $30, 16, $30 + ret $31,($26),1 + .end bn_mul_comba8 + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + mulq $0, $1, $4 + ldq $5, 16($17) + ldq $6, 16($18) + umulh $0, $1, $7 + ldq $8, 24($17) + ldq $22, 24($18) + mulq $0, $3, $23 + stq $4, 0($16) + bis $31, $31, $24 + mulq $2, $1, $28 + bis $31, $31, $25 + bis $31, $31, $27 + addq $24, $7, $24 + umulh $0, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $1, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $27, $17, $27 + mulq $0, $6, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 8($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $21, $24 + bis $31, $31, $27 + mulq $2, $3, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $5, $1, $23 + cmpult $24, $19, $17 + addq $24, $7, $24 + addq $27, $17, $27 + umulh $0, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $2, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $27, $20, $27 + umulh $5, $1, $19 + cmpult $24, $23, $17 + addq $25, $17, $25 + stq $24, 16($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $28, $24 + bis $31, $31, $27 + mulq $0, $22, $18 + cmpult $24, $28, $4 + addq $24, $21, $24 + addq $25, $4, $25 + mulq $2, $6, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $5, $3, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $8, $1, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $0, $22, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $6, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + umulh $5, $3, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 24($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $21, $24 + bis $31, $31, $27 + umulh $8, $1, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + mulq $2, $22, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $5, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + mulq $8, $3, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $2, $22, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + umulh $5, $6, $18 + cmpult $24, $28, $4 + addq $24, $21, $24 + addq $25, $4, $25 + umulh $8, $3, $7 + cmpult $24, $21, $20 + addq $25, $20, $25 + stq $24, 32($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $19, $24 + bis $31, $31, $27 + mulq $5, $22, $23 + cmpult $24, $19, $17 + addq $24, $18, $24 + addq $25, $17, $25 + mulq $8, $6, $28 + cmpult $24, $18, $4 + addq $24, $7, $24 + addq $25, $4, $25 + umulh $5, $22, $21 + cmpult $24, $7, $20 + addq $24, $23, $24 + addq $25, $20, $25 + umulh $8, $6, $19 + cmpult $24, $23, $17 + addq $24, $28, $24 + addq $25, $17, $25 + mulq $8, $22, $18 + cmpult $24, $28, $4 + addq $25, $4, $25 + stq $24, 40($16) + addq $25, $27, $24 + bis $31, $31, $25 + addq $24, $21, $24 + bis $31, $31, $27 + umulh $8, $22, $7 + cmpult $24, $21, $20 + addq $24, $19, $24 + addq $25, $20, $25 + cmpult $24, $19, $23 + addq $24, $18, $24 + 
addq $25, $23, $25 + cmpult $24, $18, $17 + addq $25, $17, $25 + stq $24, 48($16) + addq $25, $27, $24 + addq $24, $7, $24 + stq $24, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_sqr_comba4 + .ent bn_sqr_comba4 +bn_sqr_comba4: +bn_sqr_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + bis $31, $31, $6 + mulq $0, $0, $4 + umulh $0, $0, $5 + stq $4, 0($16) + bis $31, $31, $4 + mulq $0, $1, $7 + umulh $0, $1, $8 + cmplt $7, $31, $22 + cmplt $8, $31, $23 + addq $7, $7, $7 + addq $8, $8, $8 + addq $8, $22, $8 + addq $4, $23, $4 + addq $5, $7, $5 + addq $6, $8, $6 + cmpult $5, $7, $24 + cmpult $6, $8, $25 + addq $6, $24, $6 + addq $4, $25, $4 + stq $5, 8($16) + bis $31, $31, $5 + mulq $1, $1, $27 + umulh $1, $1, $28 + addq $6, $27, $6 + addq $4, $28, $4 + cmpult $6, $27, $21 + cmpult $4, $28, $20 + addq $4, $21, $4 + addq $5, $20, $5 + mulq $2, $0, $19 + umulh $2, $0, $18 + cmplt $19, $31, $17 + cmplt $18, $31, $22 + addq $19, $19, $19 + addq $18, $18, $18 + addq $18, $17, $18 + addq $5, $22, $5 + addq $6, $19, $6 + addq $4, $18, $4 + cmpult $6, $19, $23 + cmpult $4, $18, $7 + addq $4, $23, $4 + addq $5, $7, $5 + stq $6, 16($16) + bis $31, $31, $6 + mulq $3, $0, $8 + umulh $3, $0, $24 + cmplt $8, $31, $25 + cmplt $24, $31, $27 + addq $8, $8, $8 + addq $24, $24, $24 + addq $24, $25, $24 + addq $6, $27, $6 + addq $4, $8, $4 + addq $5, $24, $5 + cmpult $4, $8, $28 + cmpult $5, $24, $21 + addq $5, $28, $5 + addq $6, $21, $6 + mulq $2, $1, $20 + umulh $2, $1, $17 + cmplt $20, $31, $22 + cmplt $17, $31, $19 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $22, $17 + addq $6, $19, $6 + addq $4, $20, $4 + addq $5, $17, $5 + cmpult $4, $20, $18 + cmpult $5, $17, $23 + addq $5, $18, $5 + addq $6, $23, $6 + stq $4, 24($16) + bis $31, $31, $4 + mulq $2, $2, $7 + umulh $2, $2, $25 + addq $5, $7, $5 + addq $6, $25, $6 + cmpult $5, $7, $27 + cmpult $6, $25, $8 + addq $6, $27, $6 + addq $4, $8, $4 + mulq $3, $1, $24 + umulh $3, $1, $28 + cmplt $24, $31, $21 + cmplt $28, $31, $22 + addq $24, $24, $24 + addq $28, $28, $28 + addq $28, $21, $28 + addq $4, $22, $4 + addq $5, $24, $5 + addq $6, $28, $6 + cmpult $5, $24, $19 + cmpult $6, $28, $20 + addq $6, $19, $6 + addq $4, $20, $4 + stq $5, 32($16) + bis $31, $31, $5 + mulq $3, $2, $17 + umulh $3, $2, $18 + cmplt $17, $31, $23 + cmplt $18, $31, $7 + addq $17, $17, $17 + addq $18, $18, $18 + addq $18, $23, $18 + addq $5, $7, $5 + addq $6, $17, $6 + addq $4, $18, $4 + cmpult $6, $17, $25 + cmpult $4, $18, $27 + addq $4, $25, $4 + addq $5, $27, $5 + stq $6, 40($16) + bis $31, $31, $6 + mulq $3, $3, $8 + umulh $3, $3, $21 + addq $4, $8, $4 + addq $5, $21, $5 + cmpult $4, $8, $22 + cmpult $5, $21, $24 + addq $5, $22, $5 + addq $6, $24, $6 + stq $4, 48($16) + stq $5, 56($16) + ret $31,($26),1 + .end bn_sqr_comba4 + .text + .align 3 + .globl bn_sqr_comba8 + .ent bn_sqr_comba8 +bn_sqr_comba8: +bn_sqr_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 8($17) + ldq $2, 16($17) + ldq $3, 24($17) + ldq $4, 32($17) + ldq $5, 40($17) + ldq $6, 48($17) + ldq $7, 56($17) + bis $31, $31, $23 + mulq $0, $0, $8 + umulh $0, $0, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $1, $0, $24 + umulh $1, $0, $25 + cmplt $24, $31, $27 + cmplt $25, $31, $28 + addq $24, $24, $24 + addq $25, $25, $25 + addq $25, $27, $25 + addq $8, $28, $8 + addq $22, $24, $22 + addq $23, $25, $23 + cmpult $22, $24, $21 + cmpult $23, $25, $20 + addq $23, $21, $23 + addq $8, $20, $8 
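+ # r[1] of the square is done: a[1]*a[0] was doubled with addq,
+ # with cmplt against $31 catching the top bit shifted out; store
+ # it and start r[2] with a[1]*a[1]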
+ stq $22, 8($16) + bis $31, $31, $22 + mulq $1, $1, $19 + umulh $1, $1, $18 + addq $23, $19, $23 + addq $8, $18, $8 + cmpult $23, $19, $17 + cmpult $8, $18, $27 + addq $8, $17, $8 + addq $22, $27, $22 + mulq $2, $0, $28 + umulh $2, $0, $24 + cmplt $28, $31, $25 + cmplt $24, $31, $21 + addq $28, $28, $28 + addq $24, $24, $24 + addq $24, $25, $24 + addq $22, $21, $22 + addq $23, $28, $23 + addq $8, $24, $8 + cmpult $23, $28, $20 + cmpult $8, $24, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $2, $1, $18 + umulh $2, $1, $17 + cmplt $18, $31, $27 + cmplt $17, $31, $25 + addq $18, $18, $18 + addq $17, $17, $17 + addq $17, $27, $17 + addq $23, $25, $23 + addq $8, $18, $8 + addq $22, $17, $22 + cmpult $8, $18, $21 + cmpult $22, $17, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $3, $0, $24 + umulh $3, $0, $20 + cmplt $24, $31, $19 + cmplt $20, $31, $27 + addq $24, $24, $24 + addq $20, $20, $20 + addq $20, $19, $20 + addq $23, $27, $23 + addq $8, $24, $8 + addq $22, $20, $22 + cmpult $8, $24, $25 + cmpult $22, $20, $18 + addq $22, $25, $22 + addq $23, $18, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $2, $17 + umulh $2, $2, $21 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $28 + cmpult $23, $21, $19 + addq $23, $28, $23 + addq $8, $19, $8 + mulq $3, $1, $27 + umulh $3, $1, $24 + cmplt $27, $31, $20 + cmplt $24, $31, $25 + addq $27, $27, $27 + addq $24, $24, $24 + addq $24, $20, $24 + addq $8, $25, $8 + addq $22, $27, $22 + addq $23, $24, $23 + cmpult $22, $27, $18 + cmpult $23, $24, $17 + addq $23, $18, $23 + addq $8, $17, $8 + mulq $4, $0, $21 + umulh $4, $0, $28 + cmplt $21, $31, $19 + cmplt $28, $31, $20 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $19, $28 + addq $8, $20, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $25 + cmpult $23, $28, $27 + addq $23, $25, $23 + addq $8, $27, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $3, $2, $24 + umulh $3, $2, $18 + cmplt $24, $31, $17 + cmplt $18, $31, $19 + addq $24, $24, $24 + addq $18, $18, $18 + addq $18, $17, $18 + addq $22, $19, $22 + addq $23, $24, $23 + addq $8, $18, $8 + cmpult $23, $24, $20 + cmpult $8, $18, $21 + addq $8, $20, $8 + addq $22, $21, $22 + mulq $4, $1, $28 + umulh $4, $1, $25 + cmplt $28, $31, $27 + cmplt $25, $31, $17 + addq $28, $28, $28 + addq $25, $25, $25 + addq $25, $27, $25 + addq $22, $17, $22 + addq $23, $28, $23 + addq $8, $25, $8 + cmpult $23, $28, $19 + cmpult $8, $25, $24 + addq $8, $19, $8 + addq $22, $24, $22 + mulq $5, $0, $18 + umulh $5, $0, $20 + cmplt $18, $31, $21 + cmplt $20, $31, $27 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $21, $20 + addq $22, $27, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $28 + addq $8, $17, $8 + addq $22, $28, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $3, $3, $25 + umulh $3, $3, $19 + addq $8, $25, $8 + addq $22, $19, $22 + cmpult $8, $25, $24 + cmpult $22, $19, $21 + addq $22, $24, $22 + addq $23, $21, $23 + mulq $4, $2, $27 + umulh $4, $2, $18 + cmplt $27, $31, $20 + cmplt $18, $31, $17 + addq $27, $27, $27 + addq $18, $18, $18 + addq $18, $20, $18 + addq $23, $17, $23 + addq $8, $27, $8 + addq $22, $18, $22 + cmpult $8, $27, $28 + cmpult $22, $18, $25 + addq $22, $28, $22 + addq $23, $25, $23 + mulq $5, $1, $19 + umulh $5, $1, $24 + cmplt $19, $31, $21 + cmplt $24, $31, $20 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $21, $24 + addq $23, $20, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $17 
+ cmpult $22, $24, $27 + addq $22, $17, $22 + addq $23, $27, $23 + mulq $6, $0, $18 + umulh $6, $0, $28 + cmplt $18, $31, $25 + cmplt $28, $31, $21 + addq $18, $18, $18 + addq $28, $28, $28 + addq $28, $25, $28 + addq $23, $21, $23 + addq $8, $18, $8 + addq $22, $28, $22 + cmpult $8, $18, $20 + cmpult $22, $28, $19 + addq $22, $20, $22 + addq $23, $19, $23 + stq $8, 48($16) + bis $31, $31, $8 + mulq $4, $3, $24 + umulh $4, $3, $17 + cmplt $24, $31, $27 + cmplt $17, $31, $25 + addq $24, $24, $24 + addq $17, $17, $17 + addq $17, $27, $17 + addq $8, $25, $8 + addq $22, $24, $22 + addq $23, $17, $23 + cmpult $22, $24, $21 + cmpult $23, $17, $18 + addq $23, $21, $23 + addq $8, $18, $8 + mulq $5, $2, $28 + umulh $5, $2, $20 + cmplt $28, $31, $19 + cmplt $20, $31, $27 + addq $28, $28, $28 + addq $20, $20, $20 + addq $20, $19, $20 + addq $8, $27, $8 + addq $22, $28, $22 + addq $23, $20, $23 + cmpult $22, $28, $25 + cmpult $23, $20, $24 + addq $23, $25, $23 + addq $8, $24, $8 + mulq $6, $1, $17 + umulh $6, $1, $21 + cmplt $17, $31, $18 + cmplt $21, $31, $19 + addq $17, $17, $17 + addq $21, $21, $21 + addq $21, $18, $21 + addq $8, $19, $8 + addq $22, $17, $22 + addq $23, $21, $23 + cmpult $22, $17, $27 + cmpult $23, $21, $28 + addq $23, $27, $23 + addq $8, $28, $8 + mulq $7, $0, $20 + umulh $7, $0, $25 + cmplt $20, $31, $24 + cmplt $25, $31, $18 + addq $20, $20, $20 + addq $25, $25, $25 + addq $25, $24, $25 + addq $8, $18, $8 + addq $22, $20, $22 + addq $23, $25, $23 + cmpult $22, $20, $19 + cmpult $23, $25, $17 + addq $23, $19, $23 + addq $8, $17, $8 + stq $22, 56($16) + bis $31, $31, $22 + mulq $4, $4, $21 + umulh $4, $4, $27 + addq $23, $21, $23 + addq $8, $27, $8 + cmpult $23, $21, $28 + cmpult $8, $27, $24 + addq $8, $28, $8 + addq $22, $24, $22 + mulq $5, $3, $18 + umulh $5, $3, $20 + cmplt $18, $31, $25 + cmplt $20, $31, $19 + addq $18, $18, $18 + addq $20, $20, $20 + addq $20, $25, $20 + addq $22, $19, $22 + addq $23, $18, $23 + addq $8, $20, $8 + cmpult $23, $18, $17 + cmpult $8, $20, $21 + addq $8, $17, $8 + addq $22, $21, $22 + mulq $6, $2, $27 + umulh $6, $2, $28 + cmplt $27, $31, $24 + cmplt $28, $31, $25 + addq $27, $27, $27 + addq $28, $28, $28 + addq $28, $24, $28 + addq $22, $25, $22 + addq $23, $27, $23 + addq $8, $28, $8 + cmpult $23, $27, $19 + cmpult $8, $28, $18 + addq $8, $19, $8 + addq $22, $18, $22 + mulq $7, $1, $20 + umulh $7, $1, $17 + cmplt $20, $31, $21 + cmplt $17, $31, $24 + addq $20, $20, $20 + addq $17, $17, $17 + addq $17, $21, $17 + addq $22, $24, $22 + addq $23, $20, $23 + addq $8, $17, $8 + cmpult $23, $20, $25 + cmpult $8, $17, $27 + addq $8, $25, $8 + addq $22, $27, $22 + stq $23, 64($16) + bis $31, $31, $23 + mulq $5, $4, $28 + umulh $5, $4, $19 + cmplt $28, $31, $18 + cmplt $19, $31, $21 + addq $28, $28, $28 + addq $19, $19, $19 + addq $19, $18, $19 + addq $23, $21, $23 + addq $8, $28, $8 + addq $22, $19, $22 + cmpult $8, $28, $24 + cmpult $22, $19, $20 + addq $22, $24, $22 + addq $23, $20, $23 + mulq $6, $3, $17 + umulh $6, $3, $25 + cmplt $17, $31, $27 + cmplt $25, $31, $18 + addq $17, $17, $17 + addq $25, $25, $25 + addq $25, $27, $25 + addq $23, $18, $23 + addq $8, $17, $8 + addq $22, $25, $22 + cmpult $8, $17, $21 + cmpult $22, $25, $28 + addq $22, $21, $22 + addq $23, $28, $23 + mulq $7, $2, $19 + umulh $7, $2, $24 + cmplt $19, $31, $20 + cmplt $24, $31, $27 + addq $19, $19, $19 + addq $24, $24, $24 + addq $24, $20, $24 + addq $23, $27, $23 + addq $8, $19, $8 + addq $22, $24, $22 + cmpult $8, $19, $18 + cmpult $22, $24, $17 + addq $22, $18, $22 + addq 
$23, $17, $23 + stq $8, 72($16) + bis $31, $31, $8 + mulq $5, $5, $25 + umulh $5, $5, $21 + addq $22, $25, $22 + addq $23, $21, $23 + cmpult $22, $25, $28 + cmpult $23, $21, $20 + addq $23, $28, $23 + addq $8, $20, $8 + mulq $6, $4, $27 + umulh $6, $4, $19 + cmplt $27, $31, $24 + cmplt $19, $31, $18 + addq $27, $27, $27 + addq $19, $19, $19 + addq $19, $24, $19 + addq $8, $18, $8 + addq $22, $27, $22 + addq $23, $19, $23 + cmpult $22, $27, $17 + cmpult $23, $19, $25 + addq $23, $17, $23 + addq $8, $25, $8 + mulq $7, $3, $21 + umulh $7, $3, $28 + cmplt $21, $31, $20 + cmplt $28, $31, $24 + addq $21, $21, $21 + addq $28, $28, $28 + addq $28, $20, $28 + addq $8, $24, $8 + addq $22, $21, $22 + addq $23, $28, $23 + cmpult $22, $21, $18 + cmpult $23, $28, $27 + addq $23, $18, $23 + addq $8, $27, $8 + stq $22, 80($16) + bis $31, $31, $22 + mulq $6, $5, $19 + umulh $6, $5, $17 + cmplt $19, $31, $25 + cmplt $17, $31, $20 + addq $19, $19, $19 + addq $17, $17, $17 + addq $17, $25, $17 + addq $22, $20, $22 + addq $23, $19, $23 + addq $8, $17, $8 + cmpult $23, $19, $24 + cmpult $8, $17, $21 + addq $8, $24, $8 + addq $22, $21, $22 + mulq $7, $4, $28 + umulh $7, $4, $18 + cmplt $28, $31, $27 + cmplt $18, $31, $25 + addq $28, $28, $28 + addq $18, $18, $18 + addq $18, $27, $18 + addq $22, $25, $22 + addq $23, $28, $23 + addq $8, $18, $8 + cmpult $23, $28, $20 + cmpult $8, $18, $19 + addq $8, $20, $8 + addq $22, $19, $22 + stq $23, 88($16) + bis $31, $31, $23 + mulq $6, $6, $17 + umulh $6, $6, $24 + addq $8, $17, $8 + addq $22, $24, $22 + cmpult $8, $17, $21 + cmpult $22, $24, $27 + addq $22, $21, $22 + addq $23, $27, $23 + mulq $7, $5, $25 + umulh $7, $5, $28 + cmplt $25, $31, $18 + cmplt $28, $31, $20 + addq $25, $25, $25 + addq $28, $28, $28 + addq $28, $18, $28 + addq $23, $20, $23 + addq $8, $25, $8 + addq $22, $28, $22 + cmpult $8, $25, $19 + cmpult $22, $28, $17 + addq $22, $19, $22 + addq $23, $17, $23 + stq $8, 96($16) + bis $31, $31, $8 + mulq $7, $6, $24 + umulh $7, $6, $21 + cmplt $24, $31, $27 + cmplt $21, $31, $18 + addq $24, $24, $24 + addq $21, $21, $21 + addq $21, $27, $21 + addq $8, $18, $8 + addq $22, $24, $22 + addq $23, $21, $23 + cmpult $22, $24, $20 + cmpult $23, $21, $25 + addq $23, $20, $23 + addq $8, $25, $8 + stq $22, 104($16) + bis $31, $31, $22 + mulq $7, $7, $28 + umulh $7, $7, $19 + addq $23, $28, $23 + addq $8, $19, $8 + cmpult $23, $28, $17 + cmpult $8, $19, $27 + addq $8, $17, $8 + addq $22, $27, $22 + stq $23, 112($16) + stq $8, 120($16) + ret $31,($26),1 + .end bn_sqr_comba8 diff --git a/crypto/bn/asm/ff b/crypto/bn/asm/ff new file mode 100644 index 0000000000..4af216889d --- /dev/null +++ b/crypto/bn/asm/ff @@ -0,0 +1,724 @@ + .text + .align 3 + .globl bn_mul_comba4 + .ent bn_mul_comba4 +bn_mul_comba4: +bn_mul_comba4..ng: + .frame $30,0,$26,0 + .prologue 0 + + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + bis $31, $31, $23 + mulq $0, $1, $8 + umulh $0, $1, $22 + stq $8, 0($16) + bis $31, $31, $8 + mulq $0, $3, $24 + umulh $0, $3, $25 + addq $22, $24, $22 + cmpult $22, $24, $27 + addq $27, $25, $25 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $8, $28, $8 + mulq $2, $1, $21 + umulh $2, $1, $20 + addq $22, $21, $22 + cmpult $22, $21, $19 + addq $19, $20, $20 + addq $23, $20, $23 + cmpult $23, $20, $17 + addq $8, $17, $8 + stq $22, 8($16) + bis $31, $31, $22 + mulq $2, $3, $18 + umulh $2, $3, $24 + addq $23, $18, $23 + cmpult $23, $18, $27 + addq $27, $24, $24 + addq $8, $24, 
$8 + cmpult $8, $24, $25 + addq $22, $25, $22 + mulq $0, $5, $28 + umulh $0, $5, $21 + addq $23, $28, $23 + cmpult $23, $28, $19 + addq $19, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + mulq $4, $1, $17 + umulh $4, $1, $18 + addq $23, $17, $23 + cmpult $23, $17, $27 + addq $27, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $24 + addq $22, $24, $22 + stq $23, 16($16) + bis $31, $31, $23 + mulq $0, $7, $25 + umulh $0, $7, $28 + addq $8, $25, $8 + cmpult $8, $25, $19 + addq $19, $28, $28 + addq $22, $28, $22 + cmpult $22, $28, $21 + addq $23, $21, $23 + mulq $2, $5, $20 + umulh $2, $5, $17 + addq $8, $20, $8 + cmpult $8, $20, $27 + addq $27, $17, $17 + addq $22, $17, $22 + cmpult $22, $17, $18 + addq $23, $18, $23 + mulq $4, $3, $24 + umulh $4, $3, $25 + addq $8, $24, $8 + cmpult $8, $24, $19 + addq $19, $25, $25 + addq $22, $25, $22 + cmpult $22, $25, $28 + addq $23, $28, $23 + mulq $6, $1, $21 + umulh $6, $1, $0 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $20, $0, $0 + addq $22, $0, $22 + cmpult $22, $0, $27 + addq $23, $27, $23 + stq $8, 24($16) + bis $31, $31, $8 + mulq $2, $7, $17 + umulh $2, $7, $18 + addq $22, $17, $22 + cmpult $22, $17, $24 + addq $24, $18, $18 + addq $23, $18, $23 + cmpult $23, $18, $19 + addq $8, $19, $8 + mulq $4, $5, $25 + umulh $4, $5, $28 + addq $22, $25, $22 + cmpult $22, $25, $21 + addq $21, $28, $28 + addq $23, $28, $23 + cmpult $23, $28, $20 + addq $8, $20, $8 + mulq $6, $3, $0 + umulh $6, $3, $27 + addq $22, $0, $22 + cmpult $22, $0, $1 + addq $1, $27, $27 + addq $23, $27, $23 + cmpult $23, $27, $17 + addq $8, $17, $8 + stq $22, 32($16) + bis $31, $31, $22 + mulq $4, $7, $24 + umulh $4, $7, $18 + addq $23, $24, $23 + cmpult $23, $24, $19 + addq $19, $18, $18 + addq $8, $18, $8 + cmpult $8, $18, $2 + addq $22, $2, $22 + mulq $6, $5, $25 + umulh $6, $5, $21 + addq $23, $25, $23 + cmpult $23, $25, $28 + addq $28, $21, $21 + addq $8, $21, $8 + cmpult $8, $21, $20 + addq $22, $20, $22 + stq $23, 40($16) + bis $31, $31, $23 + mulq $6, $7, $0 + umulh $6, $7, $1 + addq $8, $0, $8 + cmpult $8, $0, $27 + addq $27, $1, $1 + addq $22, $1, $22 + cmpult $22, $1, $17 + addq $23, $17, $23 + stq $8, 48($16) + stq $22, 56($16) + ret $31,($26),1 + .end bn_mul_comba4 + .text + .align 3 + .globl bn_mul_comba8 + .ent bn_mul_comba8 +bn_mul_comba8: +bn_mul_comba8..ng: + .frame $30,0,$26,0 + .prologue 0 + + stq $9, 8($30) + stq $10, 16($30) + ldq $0, 0($17) + ldq $1, 0($18) + ldq $2, 8($17) + ldq $3, 8($18) + ldq $4, 16($17) + ldq $5, 16($18) + ldq $6, 24($17) + ldq $7, 24($18) + ldq $8, 8($17) + ldq $22, 8($18) + ldq $23, 8($17) + ldq $24, 8($18) + ldq $25, 8($17) + ldq $27, 8($18) + ldq $28, 8($17) + ldq $21, 8($18) + bis $31, $31, $9 + mulq $0, $1, $20 + umulh $0, $1, $19 + stq $20, 0($16) + bis $31, $31, $20 + mulq $0, $3, $10 + umulh $0, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $2, $1, $18 + umulh $2, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 8($16) + bis $31, $31, $19 + mulq $0, $5, $10 + umulh $0, $5, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $3, $18 + umulh $2, $3, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $1, $10 + umulh $4, $1, $17 + addq $9, 
$10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 16($16) + bis $31, $31, $9 + mulq $0, $7, $18 + umulh $0, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $2, $5, $10 + umulh $2, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $4, $3, $18 + umulh $4, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $6, $1, $10 + umulh $6, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 24($16) + bis $31, $31, $20 + mulq $0, $22, $18 + umulh $0, $22, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $7, $10 + umulh $2, $7, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $4, $5, $18 + umulh $4, $5, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $6, $3, $10 + umulh $6, $3, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $8, $1, $18 + umulh $8, $1, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + stq $19, 32($16) + bis $31, $31, $19 + mulq $0, $24, $10 + umulh $0, $24, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $2, $22, $18 + umulh $2, $22, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $4, $7, $10 + umulh $4, $7, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $6, $5, $18 + umulh $6, $5, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + mulq $8, $3, $10 + umulh $8, $3, $17 + addq $9, $10, $9 + cmpult $9, $10, $18 + addq $18, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + mulq $23, $1, $18 + umulh $23, $1, $17 + addq $9, $18, $9 + cmpult $9, $18, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $19, $18, $19 + stq $9, 40($16) + bis $31, $31, $9 + mulq $0, $27, $10 + umulh $0, $27, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $2, $24, $18 + umulh $2, $24, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $4, $22, $10 + umulh $4, $22, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $6, $7, $18 + umulh $6, $7, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $8, $5, $10 + umulh $8, $5, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq 
$19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + mulq $23, $3, $18 + umulh $23, $3, $17 + addq $20, $18, $20 + cmpult $20, $18, $10 + addq $10, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $9, $18, $9 + mulq $25, $1, $10 + umulh $25, $1, $17 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $18, $17, $17 + addq $19, $17, $19 + cmpult $19, $17, $10 + addq $9, $10, $9 + stq $20, 48($16) + bis $31, $31, $20 + mulq $0, $21, $18 + umulh $0, $21, $17 + addq $19, $18, $19 + cmpult $19, $18, $10 + addq $10, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $20, $18, $20 + mulq $2, $27, $10 + umulh $2, $27, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $4, $24, $10 + umulh $4, $24, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $6, $22, $10 + umulh $6, $22, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $8, $7, $10 + umulh $8, $7, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $23, $5, $10 + umulh $23, $5, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + mulq $25, $3, $10 + umulh $25, $3, $18 + addq $19, $10, $19 + cmpult $19, $10, $17 + addq $17, $18, $18 + addq $9, $18, $9 + cmpult $9, $18, $0 + addq $20, $0, $20 + mulq $28, $1, $10 + umulh $28, $1, $17 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $0 + addq $20, $0, $20 + stq $19, 56($16) + bis $31, $31, $19 + mulq $2, $21, $10 + umulh $2, $21, $18 + addq $9, $10, $9 + cmpult $9, $10, $17 + addq $17, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $0 + addq $19, $0, $19 + mulq $4, $27, $1 + umulh $4, $27, $10 + addq $9, $1, $9 + cmpult $9, $1, $17 + addq $17, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $18 + addq $19, $18, $19 + mulq $6, $24, $0 + umulh $6, $24, $2 + addq $9, $0, $9 + cmpult $9, $0, $1 + addq $1, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $17 + addq $19, $17, $19 + mulq $8, $22, $10 + umulh $8, $22, $18 + addq $9, $10, $9 + cmpult $9, $10, $0 + addq $0, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $1 + addq $19, $1, $19 + mulq $23, $7, $2 + umulh $23, $7, $17 + addq $9, $2, $9 + cmpult $9, $2, $10 + addq $10, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $0 + addq $19, $0, $19 + mulq $25, $5, $18 + umulh $25, $5, $1 + addq $9, $18, $9 + cmpult $9, $18, $2 + addq $2, $1, $1 + addq $20, $1, $20 + cmpult $20, $1, $10 + addq $19, $10, $19 + mulq $28, $3, $17 + umulh $28, $3, $0 + addq $9, $17, $9 + cmpult $9, $17, $18 + addq $18, $0, $0 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq $19, $2, $19 + stq $9, 64($16) + bis $31, $31, $9 + mulq $4, $21, $1 + umulh $4, $21, $10 + addq $20, $1, $20 + cmpult $20, $1, $17 + addq $17, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $18 + addq $9, $18, $9 + mulq $6, $27, $0 + umulh $6, $27, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + mulq $8, $24, $17 + umulh $8, $24, $10 + addq $20, $17, $20 + cmpult $20, $17, $18 + addq $18, $10, $10 + addq $19, $10, $19 + cmpult $19, $10, $4 + addq $9, $4, $9 + mulq $23, $22, $0 + umulh $23, $22, $3 + addq $20, $0, $20 + cmpult $20, $0, $2 + addq 
$2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $1 + addq $9, $1, $9 + mulq $25, $7, $17 + umulh $25, $7, $18 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $10, $18, $18 + addq $19, $18, $19 + cmpult $19, $18, $4 + addq $9, $4, $9 + mulq $28, $5, $0 + umulh $28, $5, $2 + addq $20, $0, $20 + cmpult $20, $0, $3 + addq $3, $2, $2 + addq $19, $2, $19 + cmpult $19, $2, $1 + addq $9, $1, $9 + stq $20, 72($16) + bis $31, $31, $20 + mulq $6, $21, $17 + umulh $6, $21, $10 + addq $19, $17, $19 + cmpult $19, $17, $18 + addq $18, $10, $10 + addq $9, $10, $9 + cmpult $9, $10, $4 + addq $20, $4, $20 + mulq $8, $27, $0 + umulh $8, $27, $3 + addq $19, $0, $19 + cmpult $19, $0, $2 + addq $2, $3, $3 + addq $9, $3, $9 + cmpult $9, $3, $1 + addq $20, $1, $20 + mulq $23, $24, $5 + umulh $23, $24, $17 + addq $19, $5, $19 + cmpult $19, $5, $18 + addq $18, $17, $17 + addq $9, $17, $9 + cmpult $9, $17, $10 + addq $20, $10, $20 + mulq $25, $22, $4 + umulh $25, $22, $6 + addq $19, $4, $19 + cmpult $19, $4, $0 + addq $0, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $2 + addq $20, $2, $20 + mulq $28, $7, $3 + umulh $28, $7, $1 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $5, $1, $1 + addq $9, $1, $9 + cmpult $9, $1, $18 + addq $20, $18, $20 + stq $19, 80($16) + bis $31, $31, $19 + mulq $8, $21, $17 + umulh $8, $21, $10 + addq $9, $17, $9 + cmpult $9, $17, $4 + addq $4, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $0 + addq $19, $0, $19 + mulq $23, $27, $6 + umulh $23, $27, $2 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $3, $2, $2 + addq $20, $2, $20 + cmpult $20, $2, $5 + addq $19, $5, $19 + mulq $25, $24, $1 + umulh $25, $24, $18 + addq $9, $1, $9 + cmpult $9, $1, $7 + addq $7, $18, $18 + addq $20, $18, $20 + cmpult $20, $18, $17 + addq $19, $17, $19 + mulq $28, $22, $4 + umulh $28, $22, $10 + addq $9, $4, $9 + cmpult $9, $4, $0 + addq $0, $10, $10 + addq $20, $10, $20 + cmpult $20, $10, $8 + addq $19, $8, $19 + stq $9, 88($16) + bis $31, $31, $9 + mulq $23, $21, $6 + umulh $23, $21, $3 + addq $20, $6, $20 + cmpult $20, $6, $2 + addq $2, $3, $3 + addq $19, $3, $19 + cmpult $19, $3, $5 + addq $9, $5, $9 + mulq $25, $27, $1 + umulh $25, $27, $7 + addq $20, $1, $20 + cmpult $20, $1, $18 + addq $18, $7, $7 + addq $19, $7, $19 + cmpult $19, $7, $17 + addq $9, $17, $9 + mulq $28, $24, $4 + umulh $28, $24, $0 + addq $20, $4, $20 + cmpult $20, $4, $10 + addq $10, $0, $0 + addq $19, $0, $19 + cmpult $19, $0, $8 + addq $9, $8, $9 + stq $20, 96($16) + bis $31, $31, $20 + mulq $25, $21, $22 + umulh $25, $21, $6 + addq $19, $22, $19 + cmpult $19, $22, $2 + addq $2, $6, $6 + addq $9, $6, $9 + cmpult $9, $6, $3 + addq $20, $3, $20 + mulq $28, $27, $5 + umulh $28, $27, $23 + addq $19, $5, $19 + cmpult $19, $5, $1 + addq $1, $23, $23 + addq $9, $23, $9 + cmpult $9, $23, $18 + addq $20, $18, $20 + stq $19, 104($16) + bis $31, $31, $19 + mulq $28, $21, $7 + umulh $28, $21, $17 + addq $9, $7, $9 + cmpult $9, $7, $4 + addq $4, $17, $17 + addq $20, $17, $20 + cmpult $20, $17, $10 + addq $19, $10, $19 + stq $9, 112($16) + stq $20, 120($16) + ldq $9, 8($30) + ldq $10, 16($30) + ret $31,($26),1 + .end bn_mul_comba8 diff --git a/crypto/bn/asm/mips1.s b/crypto/bn/asm/mips1.s new file mode 100644 index 0000000000..44fa1254c7 --- /dev/null +++ b/crypto/bn/asm/mips1.s @@ -0,0 +1,539 @@ +/* This assembler is for R2000/R3000 machines, or higher ones that do + * not want to do any 64 bit arithmetic. + * Make sure that the SSLeay bignum library is compiled with + * THIRTY_TWO_BIT set. 
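+ *
+ * With THIRTY_TWO_BIT each BN_ULONG is 32 bits, so a single multu
+ * below forms the full 64 bit product in HI/LO, and mflo/mfhi split
+ * it into the result word and the carry word.  Roughly, the inner
+ * step of bn_mul_add_words is (a C sketch only, names as in the C
+ * version, not the generated code):
+ *
+ *	t = (unsigned long long)a[i]*w + r[i] + c;
+ *	r[i] = (unsigned int)t;
+ *	c = (unsigned int)(t>>32);
+ *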
+ * This must either be compiled with the system CC, or, if you use GNU gas, + * cc -E mips1.s|gas -o mips1.o + */ + .set reorder + .set noat + +#define R1 $1 +#define CC $2 +#define R2 $3 +#define R3 $8 +#define R4 $9 +#define L1 $10 +#define L2 $11 +#define L3 $12 +#define L4 $13 +#define H1 $14 +#define H2 $15 +#define H3 $24 +#define H4 $25 + +#define P1 $4 +#define P2 $5 +#define P3 $6 +#define P4 $7 + + .align 2 + .ent bn_mul_add_words + .globl bn_mul_add_words +.text +bn_mul_add_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + #blt P3,4,$lab34 + + subu R1,P3,4 + move CC,$0 + bltz R1,$lab34 +$lab2: + lw R1,0(P1) + lw L1,0(P2) + lw R2,4(P1) + lw L2,4(P2) + lw R3,8(P1) + lw L3,8(P2) + lw R4,12(P1) + lw L4,12(P2) + multu L1,P4 + addu R1,R1,CC + mflo L1 + sltu CC,R1,CC + addu R1,R1,L1 + mfhi H1 + sltu L1,R1,L1 + sw R1,0(P1) + addu CC,CC,L1 + multu L2,P4 + addu CC,H1,CC + mflo L2 + addu R2,R2,CC + sltu CC,R2,CC + mfhi H2 + addu R2,R2,L2 + addu P2,P2,16 + sltu L2,R2,L2 + sw R2,4(P1) + addu CC,CC,L2 + multu L3,P4 + addu CC,H2,CC + mflo L3 + addu R3,R3,CC + sltu CC,R3,CC + mfhi H3 + addu R3,R3,L3 + addu P1,P1,16 + sltu L3,R3,L3 + sw R3,-8(P1) + addu CC,CC,L3 + multu L4,P4 + addu CC,H3,CC + mflo L4 + addu R4,R4,CC + sltu CC,R4,CC + mfhi H4 + addu R4,R4,L4 + subu P3,P3,4 + sltu L4,R4,L4 + addu CC,CC,L4 + addu CC,H4,CC + + subu R1,P3,4 + sw R4,-4(P1) # delay slot + bgez R1,$lab2 + + bleu P3,0,$lab3 + .align 2 +$lab33: + lw L1,0(P2) + lw R1,0(P1) + multu L1,P4 + addu R1,R1,CC + sltu CC,R1,CC + addu P1,P1,4 + mflo L1 + mfhi H1 + addu R1,R1,L1 + addu P2,P2,4 + sltu L1,R1,L1 + subu P3,P3,1 + addu CC,CC,L1 + sw R1,-4(P1) + addu CC,H1,CC + bgtz P3,$lab33 + j $31 + .align 2 +$lab3: + j $31 + .align 2 +$lab34: + bgt P3,0,$lab33 + j $31 + .end bn_mul_add_words + + .align 2 + # Program Unit: bn_mul_words + .ent bn_mul_words + .globl bn_mul_words +.text +bn_mul_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + move CC,$0 + bltz P3,$lab45 +$lab44: + lw L1,0(P2) + lw L2,4(P2) + lw L3,8(P2) + lw L4,12(P2) + multu L1,P4 + subu P3,P3,4 + mflo L1 + mfhi H1 + addu L1,L1,CC + multu L2,P4 + sltu CC,L1,CC + sw L1,0(P1) + addu CC,H1,CC + mflo L2 + mfhi H2 + addu L2,L2,CC + multu L3,P4 + sltu CC,L2,CC + sw L2,4(P1) + addu CC,H2,CC + mflo L3 + mfhi H3 + addu L3,L3,CC + multu L4,P4 + sltu CC,L3,CC + sw L3,8(P1) + addu CC,H3,CC + mflo L4 + mfhi H4 + addu L4,L4,CC + addu P1,P1,16 + sltu CC,L4,CC + addu P2,P2,16 + addu CC,H4,CC + sw L4,-4(P1) + + bgez P3,$lab44 + b $lab45 +$lab46: + lw L1,0(P2) + addu P1,P1,4 + multu L1,P4 + addu P2,P2,4 + mflo L1 + mfhi H1 + addu L1,L1,CC + subu P3,P3,1 + sltu CC,L1,CC + sw L1,-4(P1) + addu CC,H1,CC + bgtz P3,$lab46 + j $31 +$lab45: + addu P3,P3,4 + bgtz P3,$lab46 + j $31 + .align 2 + .end bn_mul_words + + # Program Unit: bn_sqr_words + .ent bn_sqr_words + .globl bn_sqr_words +.text +bn_sqr_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + bltz P3,$lab55 +$lab54: + lw L1,0(P2) + lw L2,4(P2) + lw L3,8(P2) + lw L4,12(P2) + + multu L1,L1 + subu P3,P3,4 + mflo L1 + mfhi H1 + sw L1,0(P1) + sw H1,4(P1) + + multu L2,L2 + addu P1,P1,32 + mflo L2 + mfhi H2 + sw L2,-24(P1) + sw H2,-20(P1) + + multu L3,L3 + addu P2,P2,16 + mflo L3 + mfhi H3 + sw L3,-16(P1) + sw H3,-12(P1) + + multu L4,L4 + + mflo L4 + mfhi H4 + sw L4,-8(P1) + sw H4,-4(P1) + + bgtz P3,$lab54 + b $lab55 +$lab56: + lw L1,0(P2) + addu P1,P1,8 + multu L1,L1 + addu P2,P2,4 + subu P3,P3,1 + mflo L1 + mfhi H1 + sw L1,-8(P1) + sw H1,-4(P1) + 
+ bgtz P3,$lab56 + j $31 +$lab55: + addu P3,P3,4 + bgtz P3,$lab56 + j $31 + .align 2 + .end bn_sqr_words + + # Program Unit: bn_add_words + .ent bn_add_words + .globl bn_add_words +.text +bn_add_words: # 0x590 + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P4,P4,4 + move CC,$0 + bltz P4,$lab65 +$lab64: + lw L1,0(P2) + lw R1,0(P3) + lw L2,4(P2) + lw R2,4(P3) + + addu L1,L1,CC + lw L3,8(P2) + sltu CC,L1,CC + addu L1,L1,R1 + sltu R1,L1,R1 + lw R3,8(P3) + addu CC,CC,R1 + lw L4,12(P2) + + addu L2,L2,CC + lw R4,12(P3) + sltu CC,L2,CC + addu L2,L2,R2 + sltu R2,L2,R2 + sw L1,0(P1) + addu CC,CC,R2 + addu P1,P1,16 + addu L3,L3,CC + sw L2,-12(P1) + + sltu CC,L3,CC + addu L3,L3,R3 + sltu R3,L3,R3 + addu P2,P2,16 + addu CC,CC,R3 + + addu L4,L4,CC + addu P3,P3,16 + sltu CC,L4,CC + addu L4,L4,R4 + subu P4,P4,4 + sltu R4,L4,R4 + sw L3,-8(P1) + addu CC,CC,R4 + sw L4,-4(P1) + + bgtz P4,$lab64 + b $lab65 +$lab66: + lw L1,0(P2) + lw R1,0(P3) + addu L1,L1,CC + addu P1,P1,4 + sltu CC,L1,CC + addu P2,P2,4 + addu P3,P3,4 + addu L1,L1,R1 + subu P4,P4,1 + sltu R1,L1,R1 + sw L1,-4(P1) + addu CC,CC,R1 + + bgtz P4,$lab66 + j $31 +$lab65: + addu P4,P4,4 + bgtz P4,$lab66 + j $31 + .end bn_add_words + + # Program Unit: bn_div64 + .set at + .set reorder + .text + .align 2 + .globl bn_div64 + # 321 { + .ent bn_div64 2 +bn_div64: + subu $sp, 64 + sw $31, 56($sp) + sw $16, 48($sp) + .mask 0x80010000, -56 + .frame $sp, 64, $31 + move $9, $4 + move $12, $5 + move $16, $6 + # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t; + move $31, $0 + # 323 int i,count=2; + li $13, 2 + # 324 + # 325 if (d == 0) return(BN_MASK2); + bne $16, 0, $80 + li $2, -1 + b $93 +$80: + # 326 + # 327 i=BN_num_bits_word(d); + move $4, $16 + sw $31, 16($sp) + sw $9, 24($sp) + sw $12, 32($sp) + sw $13, 40($sp) + .livereg 0x800ff0e,0xfff + jal BN_num_bits_word + li $4, 32 + lw $31, 16($sp) + lw $9, 24($sp) + lw $12, 32($sp) + lw $13, 40($sp) + move $3, $2 + # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i)) + beq $2, $4, $81 + li $14, 1 + sll $15, $14, $2 + bleu $9, $15, $81 + # 329 { + # 330 #if !defined(NO_STDIO) && !defined(WIN16) + # 331 fprintf(stderr,"Division would overflow (%d)\n",i); + # 332 #endif + # 333 abort(); + sw $3, 8($sp) + sw $9, 24($sp) + sw $12, 32($sp) + sw $13, 40($sp) + sw $31, 26($sp) + .livereg 0xff0e,0xfff + jal abort + lw $3, 8($sp) + li $4, 32 + lw $9, 24($sp) + lw $12, 32($sp) + lw $13, 40($sp) + lw $31, 26($sp) + # 334 } +$81: + # 335 i=BN_BITS2-i; + subu $3, $4, $3 + # 336 if (h >= d) h-=d; + bltu $9, $16, $82 + subu $9, $9, $16 +$82: + # 337 + # 338 if (i) + beq $3, 0, $83 + # 339 { + # 340 d<<=i; + sll $16, $16, $3 + # 341 h=(h<<i)|(l>>(BN_BITS2-i)); + sll $24, $9, $3 + subu $25, $4, $3 + srl $14, $12, $25 + or $9, $24, $14 + # 342 l<<=i; + sll $12, $12, $3 + # 343 } +$83: + # 344 dh=(d&BN_MASK2h)>>BN_BITS4; + # 345 dl=(d&BN_MASK2l); + and $8, $16, -65536 + srl $8, $8, 16 + and $10, $16, 65535 + li $6, -65536 +$84: + # 346 for (;;) + # 347 { + # 348 if ((h>>BN_BITS4) == dh) + srl $15, $9, 16 + bne $8, $15, $85 + # 349 q=BN_MASK2l; + li $5, 65535 + b $86 +$85: + # 350 else + # 351 q=h/dh; + divu $5, $9, $8 +$86: + # 352 + # 353 for (;;) + # 354 { + # 355 t=(h-q*dh); + mul $4, $5, $8 + subu $2, $9, $4 + move $3, $2 + # 356 if ((t&BN_MASK2h) || + # 357 ((dl*q) <= ( + # 358 (t<<BN_BITS4)+ + # 359 ((l&BN_MASK2h)>>BN_BITS4)))) + and $25, $2, $6 + bne $25, $0, $87 + mul $24, $10, $5 + sll $14, $3, 16 + and $15, $12, $6 + srl $25, $15, 16 + addu $15, $14, $25 + bgtu $24, $15, $88 +$87: + # 360 break; + mul $3, $10, $5 
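+	# break path: $3 is left holding q*dl (tl) for the fix-up code at $89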
+ b $89 +$88: + # 361 q--; + addu $5, $5, -1 + # 362 } + b $86 +$89: + # 363 th=q*dh; + # 364 tl=q*dl; + # 365 t=(tl>>BN_BITS4); + # 366 tl=(tl<<BN_BITS4)&BN_MASK2h; + sll $14, $3, 16 + and $2, $14, $6 + move $11, $2 + # 367 th+=t; + srl $25, $3, 16 + addu $7, $4, $25 + # 368 + # 369 if (l < tl) th++; + bgeu $12, $2, $90 + addu $7, $7, 1 +$90: + # 370 l-=tl; + subu $12, $12, $11 + # 371 if (h < th) + bgeu $9, $7, $91 + # 372 { + # 373 h+=d; + addu $9, $9, $16 + # 374 q--; + addu $5, $5, -1 + # 375 } +$91: + # 376 h-=th; + subu $9, $9, $7 + # 377 + # 378 if (--count == 0) break; + addu $13, $13, -1 + beq $13, 0, $92 + # 379 + # 380 ret=q<<BN_BITS4; + sll $31, $5, 16 + # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; + sll $24, $9, 16 + srl $15, $12, 16 + or $9, $24, $15 + # 382 l=(l&BN_MASK2l)<<BN_BITS4; + and $12, $12, 65535 + sll $12, $12, 16 + # 383 } + b $84 +$92: + # 384 ret|=q; + or $31, $31, $5 + # 385 return(ret); + move $2, $31 +$93: + lw $16, 48($sp) + lw $31, 56($sp) + addu $sp, 64 + j $31 + .end bn_div64 + diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s new file mode 100644 index 0000000000..e8fdd50d16 --- /dev/null +++ b/crypto/bn/asm/mips3.s @@ -0,0 +1,544 @@ +/* This assember is for R4000 and above machines. It takes advantage + * of the 64 bit registers present on these CPUs. + * Make sure that the SSLeay bignum library is compiled with + * SIXTY_FOUR_BIT set and BN_LLONG undefined. + * This must either be compiled with the system CC, or, if you use GNU gas, + * cc -E mips3.s|gas -o mips3.o + */ + .set reorder + .set noat + +#define R1 $1 +#define CC $2 +#define R2 $3 +#define R3 $8 +#define R4 $9 +#define L1 $10 +#define L2 $11 +#define L3 $12 +#define L4 $13 +#define H1 $14 +#define H2 $15 +#define H3 $24 +#define H4 $25 + +#define P1 $4 +#define P2 $5 +#define P3 $6 +#define P4 $7 + + .align 2 + .ent bn_mul_add_words + .globl bn_mul_add_words +.text +bn_mul_add_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + #blt P3,4,$lab34 + + subu R1,P3,4 + move CC,$0 + bltz R1,$lab34 +$lab2: + ld R1,0(P1) + ld L1,0(P2) + ld R2,8(P1) + ld L2,8(P2) + ld R3,16(P1) + ld L3,16(P2) + ld R4,24(P1) + ld L4,24(P2) + dmultu L1,P4 + daddu R1,R1,CC + mflo L1 + sltu CC,R1,CC + daddu R1,R1,L1 + mfhi H1 + sltu L1,R1,L1 + sd R1,0(P1) + daddu CC,CC,L1 + dmultu L2,P4 + daddu CC,H1,CC + mflo L2 + daddu R2,R2,CC + sltu CC,R2,CC + mfhi H2 + daddu R2,R2,L2 + daddu P2,P2,32 + sltu L2,R2,L2 + sd R2,8(P1) + daddu CC,CC,L2 + dmultu L3,P4 + daddu CC,H2,CC + mflo L3 + daddu R3,R3,CC + sltu CC,R3,CC + mfhi H3 + daddu R3,R3,L3 + daddu P1,P1,32 + sltu L3,R3,L3 + sd R3,-16(P1) + daddu CC,CC,L3 + dmultu L4,P4 + daddu CC,H3,CC + mflo L4 + daddu R4,R4,CC + sltu CC,R4,CC + mfhi H4 + daddu R4,R4,L4 + subu P3,P3,4 + sltu L4,R4,L4 + daddu CC,CC,L4 + daddu CC,H4,CC + + subu R1,P3,4 + sd R4,-8(P1) # delay slot + bgez R1,$lab2 + + bleu P3,0,$lab3 + .align 2 +$lab33: + ld L1,0(P2) + ld R1,0(P1) + dmultu L1,P4 + daddu R1,R1,CC + sltu CC,R1,CC + daddu P1,P1,8 + mflo L1 + mfhi H1 + daddu R1,R1,L1 + daddu P2,P2,8 + sltu L1,R1,L1 + subu P3,P3,1 + daddu CC,CC,L1 + sd R1,-8(P1) + daddu CC,H1,CC + bgtz P3,$lab33 + j $31 + .align 2 +$lab3: + j $31 + .align 2 +$lab34: + bgt P3,0,$lab33 + j $31 + .end bn_mul_add_words + + .align 2 + # Program Unit: bn_mul_words + .ent bn_mul_words + .globl bn_mul_words +.text +bn_mul_words: + .frame $sp,0,$31 + .mask 0x00000000,0 + .fmask 0x00000000,0 + + subu P3,P3,4 + move CC,$0 + bltz P3,$lab45 +$lab44: + ld L1,0(P2) + ld L2,8(P2) + ld L3,16(P2) + ld L4,24(P2) + 
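+	# four words per pass; CC carries the running high word from one
+	# limb to the next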
+	dmultu	L1,P4
+	subu	P3,P3,4
+	mflo	L1
+	mfhi	H1
+	daddu	L1,L1,CC
+	dmultu	L2,P4
+	sltu	CC,L1,CC
+	sd	L1,0(P1)
+	daddu	CC,H1,CC
+	mflo	L2
+	mfhi	H2
+	daddu	L2,L2,CC
+	dmultu	L3,P4
+	sltu	CC,L2,CC
+	sd	L2,8(P1)
+	daddu	CC,H2,CC
+	mflo	L3
+	mfhi	H3
+	daddu	L3,L3,CC
+	dmultu	L4,P4
+	sltu	CC,L3,CC
+	sd	L3,16(P1)
+	daddu	CC,H3,CC
+	mflo	L4
+	mfhi	H4
+	daddu	L4,L4,CC
+	daddu	P1,P1,32
+	sltu	CC,L4,CC
+	daddu	P2,P2,32
+	daddu	CC,H4,CC
+	sd	L4,-8(P1)
+
+	bgez	P3,$lab44
+	b	$lab45
+$lab46:
+	ld	L1,0(P2)
+	daddu	P1,P1,8
+	dmultu	L1,P4
+	daddu	P2,P2,8
+	mflo	L1
+	mfhi	H1
+	daddu	L1,L1,CC
+	subu	P3,P3,1
+	sltu	CC,L1,CC
+	sd	L1,-8(P1)
+	daddu	CC,H1,CC
+	bgtz	P3,$lab46
+	j	$31
+$lab45:
+	addu	P3,P3,4
+	bgtz	P3,$lab46
+	j	$31
+	.align	2
+	.end	bn_mul_words
+
+	# Program Unit: bn_sqr_words
+	.ent	bn_sqr_words
+	.globl	bn_sqr_words
+.text
+bn_sqr_words:
+	.frame	$sp,0,$31
+	.mask	0x00000000,0
+	.fmask	0x00000000,0
+
+	subu	P3,P3,4
+	bltz	P3,$lab55
+$lab54:
+	ld	L1,0(P2)
+	ld	L2,8(P2)
+	ld	L3,16(P2)
+	ld	L4,24(P2)
+
+	dmultu	L1,L1
+	subu	P3,P3,4
+	mflo	L1
+	mfhi	H1
+	sd	L1,0(P1)
+	sd	H1,8(P1)
+
+	dmultu	L2,L2
+	daddu	P1,P1,32
+	mflo	L2
+	mfhi	H2
+	sd	L2,-48(P1)
+	sd	H2,-40(P1)
+
+	dmultu	L3,L3
+	daddu	P2,P2,32
+	mflo	L3
+	mfhi	H3
+	sd	L3,-32(P1)
+	sd	H3,-24(P1)
+
+	dmultu	L4,L4
+
+	mflo	L4
+	mfhi	H4
+	sd	L4,-16(P1)
+	sd	H4,-8(P1)
+
+	bgtz	P3,$lab54
+	b	$lab55
+$lab56:
+	ld	L1,0(P2)
+	daddu	P1,P1,16
+	dmultu	L1,L1
+	daddu	P2,P2,8
+	subu	P3,P3,1
+	mflo	L1
+	mfhi	H1
+	sd	L1,-16(P1)
+	sd	H1,-8(P1)
+
+	bgtz	P3,$lab56
+	j	$31
+$lab55:
+	daddu	P3,P3,4
+	bgtz	P3,$lab56
+	j	$31
+	.align	2
+	.end	bn_sqr_words
+
+	# Program Unit: bn_add_words
+	.ent	bn_add_words
+	.globl	bn_add_words
+.text
+bn_add_words:	# 0x590
+	.frame	$sp,0,$31
+	.mask	0x00000000,0
+	.fmask	0x00000000,0
+
+	subu	P4,P4,4
+	move	CC,$0
+	bltz	P4,$lab65
+$lab64:
+	ld	L1,0(P2)
+	ld	R1,0(P3)
+	ld	L2,8(P2)
+	ld	R2,8(P3)
+
+	daddu	L1,L1,CC
+	ld	L3,16(P2)
+	sltu	CC,L1,CC
+	daddu	L1,L1,R1
+	sltu	R1,L1,R1
+	ld	R3,16(P3)
+	daddu	CC,CC,R1
+	ld	L4,24(P2)
+
+	daddu	L2,L2,CC
+	ld	R4,24(P3)
+	sltu	CC,L2,CC
+	daddu	L2,L2,R2
+	sltu	R2,L2,R2
+	sd	L1,0(P1)
+	daddu	CC,CC,R2
+	daddu	P1,P1,32
+	daddu	L3,L3,CC
+	sd	L2,-24(P1)
+
+	sltu	CC,L3,CC
+	daddu	L3,L3,R3
+	sltu	R3,L3,R3
+	daddu	P2,P2,32
+	daddu	CC,CC,R3
+
+	daddu	L4,L4,CC
+	daddu	P3,P3,32
+	sltu	CC,L4,CC
+	daddu	L4,L4,R4
+	sltu	R4,L4,R4
+	subu	P4,P4,4
+	sd	L3,-16(P1)
+	daddu	CC,CC,R4
+	sd	L4,-8(P1)
+
+	bgtz	P4,$lab64
+	b	$lab65
+$lab66:
+	ld	L1,0(P2)
+	ld	R1,0(P3)
+	daddu	L1,L1,CC
+	daddu	P1,P1,8
+	sltu	CC,L1,CC
+	daddu	P2,P2,8
+	daddu	P3,P3,8
+	daddu	L1,L1,R1
+	subu	P4,P4,1
+	sltu	R1,L1,R1
+	sd	L1,-8(P1)
+	daddu	CC,CC,R1
+
+	bgtz	P4,$lab66
+	j	$31
+$lab65:
+	addu	P4,P4,4
+	bgtz	P4,$lab66
+	j	$31
+	.end	bn_add_words
+
+#if 1
+	# Program Unit: bn_div64
+	.set	at
+	.set	reorder
+	.text
+	.align	2
+	.globl	bn_div64
+ # 321	{
+	.ent	bn_div64
+bn_div64:
+	dsubu	$sp, 64
+	sd	$31, 56($sp)
+	sd	$16, 48($sp)
+	.mask	0x80010000, -56
+	.frame	$sp, 64, $31
+	move	$9, $4
+	move	$12, $5
+	move	$16, $6
+ # 322	BN_ULONG	dh,dl,q,ret=0,th,tl,t;
+	move	$31, $0
+ # 323	int i,count=2;
+	li	$13, 2
+ # 324
+ # 325	if (d == 0) return(BN_MASK2);
+	bne	$16, 0, $80
+	dli	$2, -1
+	b	$93
+$80:
+ # 326
+ # 327	i=BN_num_bits_word(d);
+	move	$4, $16
+	sd	$31, 16($sp)
+	sd	$9, 24($sp)
+	sd	$12, 32($sp)
+	sd	$13, 40($sp)
+	.livereg	0x800ff0e,0xfff
+	jal	BN_num_bits_word
+	dli	$4, 64
+	ld	$31, 16($sp)
+	ld	$9, 24($sp)
+	ld	$12, 32($sp)
+	ld	$13, 40($sp)
+	move	$3, $2
+ # 328	if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
+	beq	$2, $4, $81
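+	# the next three instructions materialize (BN_ULONG)1<<i and branch
+	# to $81 unless h is greater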
dli $14, 1 + dsll $15, $14, $2 + bleu $9, $15, $81 + # 329 { + # 330 #if !defined(NO_STDIO) && !defined(WIN16) + # 331 fprintf(stderr,"Division would overflow (%d)\n",i); + # 332 #endif + # 333 abort(); + sd $3, 8($sp) + sd $31, 16($sp) + sd $9, 24($sp) + sd $12, 32($sp) + sd $13, 40($sp) + .livereg 0xff0e,0xfff + jal abort + dli $4, 64 + ld $3, 8($sp) + ld $31, 16($sp) + ld $9, 24($sp) + ld $12, 32($sp) + ld $13, 40($sp) + # 334 } +$81: + # 335 i=BN_BITS2-i; + dsubu $3, $4, $3 + # 336 if (h >= d) h-=d; + bltu $9, $16, $82 + dsubu $9, $9, $16 +$82: + # 337 + # 338 if (i) + beq $3, 0, $83 + # 339 { + # 340 d<<=i; + dsll $16, $16, $3 + # 341 h=(h<<i)|(l>>(BN_BITS2-i)); + dsll $24, $9, $3 + dsubu $25, $4, $3 + dsrl $14, $12, $25 + or $9, $24, $14 + # 342 l<<=i; + dsll $12, $12, $3 + # 343 } +$83: + # 344 dh=(d&BN_MASK2h)>>BN_BITS4; + # 345 dl=(d&BN_MASK2l); + and $8, $16,0xFFFFFFFF00000000 + dsrl $8, $8, 32 + # dli $10,0xFFFFFFFF # Is this needed? + # and $10, $16, $10 + dsll $10, $16, 32 + dsrl $10, $10, 32 + dli $6,0xFFFFFFFF00000000 +$84: + # 346 for (;;) + # 347 { + # 348 if ((h>>BN_BITS4) == dh) + dsrl $15, $9, 32 + bne $8, $15, $85 + # 349 q=BN_MASK2l; + dli $5, 0xFFFFFFFF + b $86 +$85: + # 350 else + # 351 q=h/dh; + ddivu $5, $9, $8 +$86: + # 352 + # 353 for (;;) + # 354 { + # 355 t=(h-q*dh); + dmul $4, $5, $8 + dsubu $2, $9, $4 + move $3, $2 + # 356 if ((t&BN_MASK2h) || + # 357 ((dl*q) <= ( + # 358 (t<<BN_BITS4)+ + # 359 ((l&BN_MASK2h)>>BN_BITS4)))) + and $25, $2, $6 + bne $25, $0, $87 + dmul $24, $10, $5 + dsll $14, $3, 32 + and $15, $12, $6 + dsrl $25, $15, 32 + daddu $15, $14, $25 + bgtu $24, $15, $88 +$87: + # 360 break; + dmul $3, $10, $5 + b $89 +$88: + # 361 q--; + daddu $5, $5, -1 + # 362 } + b $86 +$89: + # 363 th=q*dh; + # 364 tl=q*dl; + # 365 t=(tl>>BN_BITS4); + # 366 tl=(tl<<BN_BITS4)&BN_MASK2h; + dsll $14, $3, 32 + and $2, $14, $6 + move $11, $2 + # 367 th+=t; + dsrl $25, $3, 32 + daddu $7, $4, $25 + # 368 + # 369 if (l < tl) th++; + bgeu $12, $2, $90 + daddu $7, $7, 1 +$90: + # 370 l-=tl; + dsubu $12, $12, $11 + # 371 if (h < th) + bgeu $9, $7, $91 + # 372 { + # 373 h+=d; + daddu $9, $9, $16 + # 374 q--; + daddu $5, $5, -1 + # 375 } +$91: + # 376 h-=th; + dsubu $9, $9, $7 + # 377 + # 378 if (--count == 0) break; + addu $13, $13, -1 + beq $13, 0, $92 + # 379 + # 380 ret=q<<BN_BITS4; + dsll $31, $5, 32 + # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; + dsll $24, $9, 32 + dsrl $15, $12, 32 + or $9, $24, $15 + # 382 l=(l&BN_MASK2l)<<BN_BITS4; + and $12, $12, 0xFFFFFFFF + dsll $12, $12, 32 + # 383 } + b $84 +$92: + # 384 ret|=q; + or $31, $31, $5 + # 385 return(ret); + move $2, $31 +$93: + ld $16, 48($sp) + ld $31, 56($sp) + daddu $sp, 64 + j $31 + .end bn_div64 +#endif diff --git a/crypto/bn/asm/x86.pl b/crypto/bn/asm/x86.pl new file mode 100644 index 0000000000..bf869fd0ee --- /dev/null +++ b/crypto/bn/asm/x86.pl @@ -0,0 +1,28 @@ +#!/usr/local/bin/perl + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +require("x86/mul_add.pl"); +require("x86/mul.pl"); +require("x86/sqr.pl"); +require("x86/div.pl"); +require("x86/add.pl"); +require("x86/sub.pl"); +require("x86/comba.pl"); + +&asm_init($ARGV[0],"bn-586.pl"); + +&bn_mul_add_words("bn_mul_add_words"); +&bn_mul_words("bn_mul_words"); +&bn_sqr_words("bn_sqr_words"); +&bn_div_words("bn_div_words"); +&bn_add_words("bn_add_words"); +&bn_sub_words("bn_sub_words"); +&bn_mul_comba("bn_mul_comba8",8); +&bn_mul_comba("bn_mul_comba4",4); +&bn_sqr_comba("bn_sqr_comba8",8); +&bn_sqr_comba("bn_sqr_comba4",4); + 
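+# Assumed usage, for illustration only (the exact targets are defined by
+# x86asm.pl): something like "perl x86.pl elf >bn-586.s", where the first
+# argument picks the output syntax passed to &asm_init above.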
+&asm_finish();
+
diff --git a/crypto/bn/asm/x86/add.pl b/crypto/bn/asm/x86/add.pl
new file mode 100644
index 0000000000..0b5cf583e3
--- /dev/null
+++ b/crypto/bn/asm/x86/add.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	&mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	&mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	&and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
+		&add($tmp1,$c);
+		&mov($c,0);
+		&adc($c,$c);
+		&add($tmp1,$tmp2);
+		&adc($c,0);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($b,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	&jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&add($tmp1,$c);
+		&mov($c,0);
+		&adc($c,$c);
+		&add($tmp1,$tmp2);
+		&adc($c,0);
+		&dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		&jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/comba.pl b/crypto/bn/asm/x86/comba.pl
new file mode 100644
index 0000000000..2291253629
--- /dev/null
+++ b/crypto/bn/asm/x86/comba.pl
@@ -0,0 +1,277 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub mul_add_c
+	{
+	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("mul a[$ai]*b[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	&mul("edx");
+	&add($c0,"eax");
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	&mov("eax",&wparam(0)) if $pos > 0;		# load r[]
+	###
+	&adc($c1,"edx");
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
+	###
+	&adc($c2,0);
+	# if pos > 1, it means it is the last loop
+	&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;	# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
+	}
+
+sub sqr_add_c
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add($c0,"eax");
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	###
+	&adc($c1,"edx");
+	&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+	###
+	&adc($c2,0);
+	# if pos > 1, it means it is the last loop
+	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;	# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
+	}
+
+sub sqr_add_c2
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$a,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add("eax","eax");
+	###
+	&adc("edx","edx");
+	###
+	&adc($c2,0);
+	&add($c0,"eax");
+	&adc($c1,"edx");
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
+	&adc($c2,0);
+	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;	# save r[];
+	&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+	###
+	}
+
+sub bn_mul_comba
+	{
+	local($name,$num)=@_;
+	local($a,$b,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($tot,$end);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$b="edi";
+
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	&push("esi");
+	&mov($a,&wparam(1));
+	&push("edi");
+	&mov($b,&wparam(2));
+	&push("ebp");
+	&push("ebx");
+
+	&xor($c0,$c0);
+	&mov("eax",&DWP(0,$a,"",0));	# load the first word of a
+	&xor($c1,$c1);
+	&mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("################## Calculate word $i");
+
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($j+1) == $end)
+				{
+				$v=1;
+				$v=2 if (($i+1) == $tot);
+				}
+			else
+				{ $v=0; }
+			if (($j+1) != $end)
+				{
+				$na=($ai-1);
+				$nb=($bi+1);
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				# &mov("eax",&wparam(0));
+				# &mov(&DWP($i*4,"eax","",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&comment("save r[$i]");
+	# &mov("eax",&wparam(0));
+	&mov(&DWP($i*4,"eax","",0),$c0);
+
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_sqr_comba
+	{
+	local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2)=@_;
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($b,$tot,$end,$half);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$r="edi";
+
+	&push("esi");
+	&push("edi");
+	&push("ebp");
+	&push("ebx");
+	&mov($r,&wparam(0));
+	&mov($a,&wparam(1));
+	&xor($c0,$c0);
+	&xor($c1,$c1);
+	&mov("eax",&DWP(0,$a,"",0));	# load the first word
+
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("############### Calculate word $i");
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($ai-1) < ($bi+1))
+				{
+				$v=1;
+				$v=2 if ($i+1) == $tot;
+				}
+			else
+				{ $v=0; }
+			if (!$v)
+				{
+				$na=$ai-1;
+				$nb=$bi+1;
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+			if ($ai == $bi)
+				{
+				&sqr_add_c($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			else
+				{
+				&sqr_add_c2($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				#&mov(&DWP($i*4,$r,"",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				last;
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&mov(&DWP($i*4,$r,"",0),$c0);
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/div.pl b/crypto/bn/asm/x86/div.pl
new file mode 100644
index 0000000000..0e90152caa
--- /dev/null
+++ b/crypto/bn/asm/x86/div.pl
@@ -0,0 +1,15 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_div_words
	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+	&mov("edx",&wparam(0));	#
+	&mov("eax",&wparam(1));	#
+	&mov("ebx",&wparam(2));	#
+	&div("ebx");
+	&function_end($name);
+	}
+1;
diff --git a/crypto/bn/asm/x86/f b/crypto/bn/asm/x86/f
new file mode 100644
index 0000000000..22e4112224
--- /dev/null
+++ b/crypto/bn/asm/x86/f
@@ -0,0 +1,3 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
diff --git a/crypto/bn/asm/x86/mul.pl b/crypto/bn/asm/x86/mul.pl
new file mode 100644
index 0000000000..674cb9b055
--- /dev/null
+++ b/crypto/bn/asm/x86/mul.pl
@@ -0,0 +1,77 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ecx";
+	$r="edi";
+	$c="esi";
+	$num="ebp";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+	&mov($a,&wparam(1));	#
+	&mov($num,&wparam(2));	#
+	&mov($w,&wparam(3));	#
+
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("mw_finish"));
+
+	&set_label("mw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		&mov("eax",&DWP($i,$a,"",0));	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		# XXX
+
+		&adc("edx",0);			# H(t)+=carry
+		&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+
+		&mov($c,"edx");			# c= H(t);
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jz(&label("mw_finish"));
+	&jmp(&label("mw_loop"));
+
+	&set_label("mw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jnz(&label("mw_finish2"));
+	&jmp(&label("mw_end"));
+
+	&set_label("mw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		# XXX
+		&adc("edx",0);			# H(t)+=carry
+		&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+		&mov($c,"edx");			# c= H(t);
+		&dec($num) if ($i != 7-1);
+		&jz(&label("mw_end")) if ($i != 7-1);
+		}
+	&set_label("mw_end",0);
+	&mov("eax",$c);
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/mul_add.pl b/crypto/bn/asm/x86/mul_add.pl
new file mode 100644
index 0000000000..61830d3a90
--- /dev/null
+++ b/crypto/bn/asm/x86/mul_add.pl
@@ -0,0 +1,87 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+
+	&mov("ecx",&wparam(2));	#
+	&mov($a,&wparam(1));	#
+
+	&and("ecx",0xfffffff8);	# num / 8
+	&mov($w,&wparam(3));	#
+
+	&push("ecx");		# Up the stack for a tmp variable
+
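+	# num stays in a stack temp from here on: eax/edx are clobbered by
+	# mul each round, and esi/edi/ebx/ebp already hold c, r, a and w
+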
&jz(&label("maw_finish")); + + &set_label("maw_loop",0); + + &mov(&swtmp(0),"ecx"); # + + for ($i=0; $i<32; $i+=4) + { + &comment("Round $i"); + + &mov("eax",&DWP($i,$a,"",0)); # *a + &mul($w); # *a * w + &add("eax",$c); # L(t)+= *r + &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r + &adc("edx",0); # H(t)+=carry + &add("eax",$c); # L(t)+=c + &adc("edx",0); # H(t)+=carry + &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); + &mov($c,"edx"); # c= H(t); + } + + &comment(""); + &mov("ecx",&swtmp(0)); # + &add($a,32); + &add($r,32); + &sub("ecx",8); + &jnz(&label("maw_loop")); + + &set_label("maw_finish",0); + &mov("ecx",&wparam(2)); # get num + &and("ecx",7); + &jnz(&label("maw_finish2")); # helps branch prediction + &jmp(&label("maw_end")); + + &set_label("maw_finish2",1); + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov("eax",&DWP($i*4,$a,"",0));# *a + &mul($w); # *a * w + &add("eax",$c); # L(t)+=c + &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r + &adc("edx",0); # H(t)+=carry + &add("eax",$c); + &adc("edx",0); # H(t)+=carry + &dec("ecx") if ($i != 7-1); + &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); + &mov($c,"edx"); # c= H(t); + &jz(&label("maw_end")) if ($i != 7-1); + } + &set_label("maw_end",0); + &mov("eax",$c); + + &pop("ecx"); # clear variable from + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/sqr.pl b/crypto/bn/asm/x86/sqr.pl new file mode 100644 index 0000000000..1f90993cf6 --- /dev/null +++ b/crypto/bn/asm/x86/sqr.pl @@ -0,0 +1,60 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_sqr_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $r="esi"; + $a="edi"; + $num="ebx"; + + &mov($r,&wparam(0)); # + &mov($a,&wparam(1)); # + &mov($num,&wparam(2)); # + + &and($num,0xfffffff8); # num / 8 + &jz(&label("sw_finish")); + + &set_label("sw_loop",0); + for ($i=0; $i<32; $i+=4) + { + &comment("Round $i"); + &mov("eax",&DWP($i,$a,"",0)); # *a + # XXX + &mul("eax"); # *a * *a + &mov(&DWP($i*2,$r,"",0),"eax"); # + &mov(&DWP($i*2+4,$r,"",0),"edx");# + } + + &comment(""); + &add($a,32); + &add($r,64); + &sub($num,8); + &jnz(&label("sw_loop")); + + &set_label("sw_finish",0); + &mov($num,&wparam(2)); # get num + &and($num,7); + &jz(&label("sw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov("eax",&DWP($i*4,$a,"",0)); # *a + # XXX + &mul("eax"); # *a * *a + &mov(&DWP($i*8,$r,"",0),"eax"); # + &dec($num) if ($i != 7-1); + &mov(&DWP($i*8+4,$r,"",0),"edx"); + &jz(&label("sw_end")) if ($i != 7-1); + } + &set_label("sw_end",0); + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86/sub.pl b/crypto/bn/asm/x86/sub.pl new file mode 100644 index 0000000000..837b0e1b07 --- /dev/null +++ b/crypto/bn/asm/x86/sub.pl @@ -0,0 +1,76 @@ +#!/usr/local/bin/perl +# x86 assember + +sub bn_sub_words + { + local($name)=@_; + + &function_begin($name,""); + + &comment(""); + $a="esi"; + $b="edi"; + $c="eax"; + $r="ebx"; + $tmp1="ecx"; + $tmp2="edx"; + $num="ebp"; + + &mov($r,&wparam(0)); # get r + &mov($a,&wparam(1)); # get a + &mov($b,&wparam(2)); # get b + &mov($num,&wparam(3)); # get num + &xor($c,$c); # clear carry + &and($num,0xfffffff8); # num / 8 + + &jz(&label("aw_finish")); + + &set_label("aw_loop",0); + for ($i=0; $i<8; $i++) + { + &comment("Round $i"); + + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0)); # *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *r + } + + &comment(""); + &add($a,32); + &add($b,32); + &add($r,32); + &sub($num,8); + 
&jnz(&label("aw_loop")); + + &set_label("aw_finish",0); + &mov($num,&wparam(3)); # get num + &and($num,7); + &jz(&label("aw_end")); + + for ($i=0; $i<7; $i++) + { + &comment("Tail Round $i"); + &mov($tmp1,&DWP($i*4,$a,"",0)); # *a + &mov($tmp2,&DWP($i*4,$b,"",0));# *b + &sub($tmp1,$c); + &mov($c,0); + &adc($c,$c); + &sub($tmp1,$tmp2); + &adc($c,0); + &dec($num) if ($i != 6); + &mov(&DWP($i*4,$r,"",0),$tmp1); # *a + &jz(&label("aw_end")) if ($i != 6); + } + &set_label("aw_end",0); + +# &mov("eax",$c); # $c is "eax" + + &function_end($name); + } + +1; diff --git a/crypto/bn/asm/x86w16.asm b/crypto/bn/asm/x86w16.asm index 74a933a8cd..80a9ed6eef 100644 --- a/crypto/bn/asm/x86w16.asm +++ b/crypto/bn/asm/x86w16.asm @@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD PUBLIC 'CODE' F_TEXT ENDS _DATA SEGMENT WORD PUBLIC 'DATA' _DATA ENDS -CONST SEGMENT WORD PUBLIC 'CONST' -CONST ENDS +_CONST SEGMENT WORD PUBLIC 'CONST' +_CONST ENDS _BSS SEGMENT WORD PUBLIC 'BSS' _BSS ENDS -DGROUP GROUP CONST, _BSS, _DATA +DGROUP GROUP _CONST, _BSS, _DATA ASSUME DS: DGROUP, SS: DGROUP F_TEXT SEGMENT ASSUME CS: F_TEXT diff --git a/crypto/bn/asm/x86w32.asm b/crypto/bn/asm/x86w32.asm index fc6f917714..957d71e3b1 100644 --- a/crypto/bn/asm/x86w32.asm +++ b/crypto/bn/asm/x86w32.asm @@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD USE16 PUBLIC 'CODE' F_TEXT ENDS _DATA SEGMENT WORD USE16 PUBLIC 'DATA' _DATA ENDS -CONST SEGMENT WORD USE16 PUBLIC 'CONST' -CONST ENDS +_CONST SEGMENT WORD USE16 PUBLIC 'CONST' +_CONST ENDS _BSS SEGMENT WORD USE16 PUBLIC 'BSS' _BSS ENDS -DGROUP GROUP CONST, _BSS, _DATA +DGROUP GROUP _CONST, _BSS, _DATA ASSUME DS: DGROUP, SS: DGROUP F_TEXT SEGMENT ASSUME CS: F_TEXT @@ -89,7 +89,7 @@ $L555: mov bp,WORD PTR [bp+26] ; load num and bp,3 dec bp - js $L547 + js $L547m mov eax,ecx mul DWORD PTR es:[bx] ; w* *a @@ -100,7 +100,7 @@ $L555: mov DWORD PTR ds:[di],eax mov esi,edx dec bp - js $L547 ; Note that we are now testing for -1 + js $L547m ; Note that we are now testing for -1 ; mov eax,ecx mul DWORD PTR es:[bx+4] ; w* *a @@ -111,7 +111,7 @@ $L555: mov DWORD PTR ds:[di+4],eax mov esi,edx dec bp - js $L547 + js $L547m ; mov eax,ecx mul DWORD PTR es:[bx+8] ; w* *a @@ -121,7 +121,7 @@ $L555: adc edx,0 mov DWORD PTR ds:[di+8],eax mov esi,edx -$L547: +$L547m: mov eax,esi mov edx,esi shr edx,16 @@ -315,37 +315,35 @@ _bn_add_words PROC FAR ; ap = 22 ; rp = 18 xor esi,esi ;c=0; + mov bx,WORD PTR [bp+18] ; load low r mov si,WORD PTR [bp+22] ; load a mov es,WORD PTR [bp+24] ; load a mov di,WORD PTR [bp+26] ; load b mov ds,WORD PTR [bp+28] ; load b mov dx,WORD PTR [bp+30] ; load num - dec dx - js $L547 xor ecx,ecx + dec dx + js $L547a $L5477: - xor ebx,ebx mov eax,DWORD PTR es:[si] ; *a add eax,ecx - adc ebx,0 + mov ecx,0 + adc ecx,0 add si,4 ; a++ add eax,DWORD PTR ds:[di] ; + *b - mov ecx,ebx adc ecx,0 - add di,4 - mov bx,WORD PTR [bp+18] mov ds,WORD PTR [bp+20] + add di,4 mov DWORD PTR ds:[bx],eax - add bx,4 mov ds,WORD PTR [bp+28] - mov WORD PTR [bp+18],bx + add bx,4 dec dx - js $L547 ; Note that we are now testing for -1 + js $L547a ; Note that we are now testing for -1 jmp $L5477 ; -$L547: +$L547a: mov eax,ecx mov edx,ecx shr edx,16 |
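For reference, every port in this commit implements the same word-level contract. The sketch below is a minimal C model of the carry discipline in bn_mul_add_words -- the discipline the unrolled assembly above reproduces four words per pass. It assumes a 32-bit BN_ULONG with a 64-bit intermediate type; the _ref name and the test values are illustrative, not part of the tree.

#include <stdio.h>

typedef unsigned int BN_ULONG;		/* assumed 32-bit word */
typedef unsigned long long BN_ULLONG;	/* assumed 64-bit intermediate */

/* r[i] += a[i]*w, propagating a carry word; returns the final carry.
 * The sum c + r[i] + a[i]*w cannot overflow 64 bits, which is why a
 * single BN_ULLONG accumulator is enough. */
BN_ULONG bn_mul_add_words_ref(BN_ULONG *r, const BN_ULONG *a, int num,
	BN_ULONG w)
	{
	BN_ULLONG c=0;
	int i;

	for (i=0; i<num; i++)
		{
		c+=(BN_ULLONG)r[i]+(BN_ULLONG)a[i]*w;
		r[i]=(BN_ULONG)c;	/* low word back into r[] */
		c>>=32;			/* high word becomes the next carry */
		}
	return((BN_ULONG)c);
	}

int main(void)
	{
	BN_ULONG r[2]={1,2},a[2]={0xffffffff,0xffffffff};
	BN_ULONG c=bn_mul_add_words_ref(r,a,2,7);

	printf("%08x %08x carry=%08x\n",r[1],r[0],c);
	return(0);
	}

The comba routines compute the same r[] column by column instead of row by row; the pos/na/nb bookkeeping in comba.pl above steers exactly that traversal.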