author     Ralf S. Engelschall <rse@openssl.org>    1998-12-21 11:00:56 +0000
committer  Ralf S. Engelschall <rse@openssl.org>    1998-12-21 11:00:56 +0000
commit     dfeab0689f69c0b4bd3480ffd37a9cacc2f17d9c (patch)
tree       2f74e0cfd76a9e092548a9bf52e579aef984299b /crypto/bn/asm
parent     58964a492275ca9a59a0cd9c8155cb2491b4b909 (diff)
download   openssl-new-SSLeay.tar.gz
Import of old SSLeay release: SSLeay 0.9.1b (unreleased)
Diffstat (limited to 'crypto/bn/asm')
-rw-r--r--  crypto/bn/asm/a.out (binary)               0 -> 5795 bytes
-rw-r--r--  crypto/bn/asm/alpha.s                      1846
-rw-r--r--  crypto/bn/asm/alpha.s.works                533
-rw-r--r--  crypto/bn/asm/alpha.works/add.pl           119
-rw-r--r--  crypto/bn/asm/alpha.works/div.pl           144
-rw-r--r--  crypto/bn/asm/alpha.works/mul.pl           116
-rw-r--r--  crypto/bn/asm/alpha.works/mul_add.pl       120
-rw-r--r--  crypto/bn/asm/alpha.works/mul_c4.pl        213
-rw-r--r--  crypto/bn/asm/alpha.works/mul_c4.works.pl  98
-rw-r--r--  crypto/bn/asm/alpha.works/mul_c8.pl        177
-rw-r--r--  crypto/bn/asm/alpha.works/sqr.pl           113
-rw-r--r--  crypto/bn/asm/alpha.works/sqr_c4.pl        109
-rw-r--r--  crypto/bn/asm/alpha.works/sqr_c8.pl        132
-rw-r--r--  crypto/bn/asm/alpha.works/sub.pl           108
-rw-r--r--  crypto/bn/asm/alpha/add.pl                 118
-rw-r--r--  crypto/bn/asm/alpha/div.pl                 144
-rw-r--r--  crypto/bn/asm/alpha/mul.pl                 104
-rw-r--r--  crypto/bn/asm/alpha/mul_add.pl             123
-rw-r--r--  crypto/bn/asm/alpha/mul_c4.pl              215
-rw-r--r--  crypto/bn/asm/alpha/mul_c4.works.pl        98
-rw-r--r--  crypto/bn/asm/alpha/mul_c8.pl              177
-rw-r--r--  crypto/bn/asm/alpha/sqr.pl                 113
-rw-r--r--  crypto/bn/asm/alpha/sqr_c4.pl              109
-rw-r--r--  crypto/bn/asm/alpha/sqr_c8.pl              132
-rw-r--r--  crypto/bn/asm/alpha/sub.pl                 108
-rw-r--r--  crypto/bn/asm/bn-586.pl                    82
-rw-r--r--  crypto/bn/asm/bn-alpha.pl                  571
-rw-r--r--  crypto/bn/asm/bn-win32.asm                 1441
-rw-r--r--  crypto/bn/asm/bn86unix.cpp                 1465
-rw-r--r--  crypto/bn/asm/ca.pl                        33
-rw-r--r--  crypto/bn/asm/co-586.pl                    286
-rw-r--r--  crypto/bn/asm/co-alpha.pl                  116
-rw-r--r--  crypto/bn/asm/co86unix.cpp                 1315
-rw-r--r--  crypto/bn/asm/elf.s                        1269
-rw-r--r--  crypto/bn/asm/f                            500
-rw-r--r--  crypto/bn/asm/f.c                          8
-rw-r--r--  crypto/bn/asm/f.elf                        2149
-rw-r--r--  crypto/bn/asm/f.s                          1773
-rw-r--r--  crypto/bn/asm/ff                           724
-rw-r--r--  crypto/bn/asm/mips1.s                      539
-rw-r--r--  crypto/bn/asm/mips3.s                      544
-rw-r--r--  crypto/bn/asm/x86.pl                       28
-rw-r--r--  crypto/bn/asm/x86/add.pl                   76
-rw-r--r--  crypto/bn/asm/x86/comba.pl                 277
-rw-r--r--  crypto/bn/asm/x86/div.pl                   15
-rw-r--r--  crypto/bn/asm/x86/f                        3
-rw-r--r--  crypto/bn/asm/x86/mul.pl                   77
-rw-r--r--  crypto/bn/asm/x86/mul_add.pl               87
-rw-r--r--  crypto/bn/asm/x86/sqr.pl                   60
-rw-r--r--  crypto/bn/asm/x86/sub.pl                   76
-rw-r--r--  crypto/bn/asm/x86w16.asm                   6
-rw-r--r--  crypto/bn/asm/x86w32.asm                   34
52 files changed, 18638 insertions, 185 deletions
diff --git a/crypto/bn/asm/a.out b/crypto/bn/asm/a.out
new file mode 100644
index 0000000000..cc5094ff45
--- /dev/null
+++ b/crypto/bn/asm/a.out
Binary files differ
diff --git a/crypto/bn/asm/alpha.s b/crypto/bn/asm/alpha.s
index 1d17b1d619..cf0b69cff9 100644
--- a/crypto/bn/asm/alpha.s
+++ b/crypto/bn/asm/alpha.s
@@ -2,7 +2,13 @@
# The bn_div64 is actually gcc output but the other parts are hand done.
# Thanks to tzeruch@ceddec.com for sending me the gcc output for
# bn_div64.
- .file 1 "bn_mulw.c"
+ # I've gone back and re-done most of the routines.
+ # The key thing to remember for the 21164 CPU is that while a
+ # multiply operation takes 8 cycles, another one can only be issued
+ # after 4 cycles have elapsed.  I've made modifications to help
+ # improve this.  Also, normally, the result of a ld instruction
+ # will not be available for about 3 cycles.
+ .file 1 "bn_asm.c"
.set noat
gcc2_compiled.:
__gnu_compiled_c:
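
The comment above is about instruction scheduling; the arithmetic itself is small. Per word, bn_mul_add_words computes r[i] = r[i] + a[i]*w plus an incoming carry, and returns the final carry. A minimal C sketch of that step (an illustrative reconstruction, not the shipped bn_asm.c: BN_ULONG is assumed to be a 64-bit unsigned type, gcc's unsigned __int128 stands in for the umulh instruction, and the helper name is made up):

    typedef unsigned long BN_ULONG;                 /* 64 bits on Alpha */

    /* *r = *r + a*w + c, returning the carry out.  mulq yields the low
     * 64 bits of a*w, umulh the high 64.  A carry out of "t = x + y" is
     * recovered afterwards with cmpult: (t < y) is 1 exactly when the
     * addition wrapped. */
    static BN_ULONG mul_add_step(BN_ULONG *r, BN_ULONG a, BN_ULONG w, BN_ULONG c)
    {
        BN_ULONG lo = a * w;                                        /* mulq  */
        BN_ULONG hi = (BN_ULONG)(((unsigned __int128)a * w) >> 64); /* umulh */
        BN_ULONG t  = *r + lo;
        hi += (t < lo);                 /* cmpult: carry from *r + lo */
        t  += c;
        hi += (t < c);                  /* cmpult: carry from + c     */
        *r = t;
        return hi;                      /* never wraps: the whole sum fits in 128 bits */
    }

The unrolled loop below interleaves four of these steps so that a new mulq can issue roughly every four cycles while earlier umulh results and loads are still in flight.
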
@@ -14,65 +20,91 @@ bn_mul_add_words:
bn_mul_add_words..ng:
.frame $30,0,$26,0
.prologue 0
- subq $18,2,$25 # num=-2
- bis $31,$31,$0
- blt $25,$42
.align 5
-$142:
- subq $18,2,$18 # num-=2
- subq $25,2,$25 # num-=2
-
- ldq $1,0($17) # a[0]
- ldq $2,8($17) # a[1]
-
- mulq $19,$1,$3 # a[0]*w low part r3
- umulh $19,$1,$1 # a[0]*w high part r1
- mulq $19,$2,$4 # a[1]*w low part r4
- umulh $19,$2,$2 # a[1]*w high part r2
-
- ldq $22,0($16) # r[0] r22
- ldq $23,8($16) # r[1] r23
-
- addq $3,$22,$3 # a0 low part + r[0]
- addq $4,$23,$4 # a1 low part + r[1]
- cmpult $3,$22,$5 # overflow?
- cmpult $4,$23,$6 # overflow?
- addq $5,$1,$1 # high part + overflow
- addq $6,$2,$2 # high part + overflow
-
- addq $3,$0,$3 # add c
- cmpult $3,$0,$5 # overflow?
- stq $3,0($16)
- addq $5,$1,$0 # c=high part + overflow
-
- addq $4,$0,$4 # add c
- cmpult $4,$0,$5 # overflow?
- stq $4,8($16)
- addq $5,$2,$0 # c=high part + overflow
+ subq $18,4,$18
+ bis $31,$31,$0
+ blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ ldq $1,0($16) # 1 1
+ .align 3
+$42:
+ mulq $20,$19,$5 # 1 2 1 ######
+ ldq $21,8($17) # 2 1
+ ldq $2,8($16) # 2 1
+ umulh $20,$19,$20 # 1 2 ######
+ ldq $27,16($17) # 3 1
+ ldq $3,16($16) # 3 1
+ mulq $21,$19,$6 # 2 2 1 ######
+ ldq $28,24($17) # 4 1
+ addq $1,$5,$1 # 1 2 2
+ ldq $4,24($16) # 4 1
+ umulh $21,$19,$21 # 2 2 ######
+ cmpult $1,$5,$22 # 1 2 3 1
+ addq $20,$22,$20 # 1 3 1
+ addq $1,$0,$1 # 1 2 3 1
+ mulq $27,$19,$7 # 3 2 1 ######
+ cmpult $1,$0,$0 # 1 2 3 2
+ addq $2,$6,$2 # 2 2 2
+ addq $20,$0,$0 # 1 3 2
+ cmpult $2,$6,$23 # 2 2 3 1
+ addq $21,$23,$21 # 2 3 1
+ umulh $27,$19,$27 # 3 2 ######
+ addq $2,$0,$2 # 2 2 3 1
+ cmpult $2,$0,$0 # 2 2 3 2
+ subq $18,4,$18
+ mulq $28,$19,$8 # 4 2 1 ######
+ addq $21,$0,$0 # 2 3 2
+ addq $3,$7,$3 # 3 2 2
+ addq $16,32,$16
+ cmpult $3,$7,$24 # 3 2 3 1
+ stq $1,-32($16) # 1 2 4
+ umulh $28,$19,$28 # 4 2 ######
+ addq $27,$24,$27 # 3 3 1
+ addq $3,$0,$3 # 3 2 3 1
+ stq $2,-24($16) # 2 2 4
+ cmpult $3,$0,$0 # 3 2 3 2
+ stq $3,-16($16) # 3 2 4
+ addq $4,$8,$4 # 4 2 2
+ addq $27,$0,$0 # 3 3 2
+ cmpult $4,$8,$25 # 4 2 3 1
+ addq $17,32,$17
+ addq $28,$25,$28 # 4 3 1
+ addq $4,$0,$4 # 4 2 3 1
+ cmpult $4,$0,$0 # 4 2 3 2
+ stq $4,-8($16) # 4 2 4
+ addq $28,$0,$0 # 4 3 2
+ blt $18,$43
- ble $18,$43
+ ldq $20,0($17) # 1 1
+ ldq $1,0($16) # 1 1
- addq $16,16,$16
- addq $17,16,$17
- blt $25,$42
+ br $42
- br $31,$142
-$42:
- ldq $1,0($17) # a[0]
- umulh $19,$1,$3 # a[0]*w high part
- mulq $19,$1,$1 # a[0]*w low part
- ldq $2,0($16) # r[0]
- addq $1,$2,$1 # low part + r[0]
- cmpult $1,$2,$4 # overflow?
- addq $4,$3,$3 # high part + overflow
- addq $1,$0,$1 # add c
- cmpult $1,$0,$4 # overflow?
- addq $4,$3,$0 # c=high part + overflow
- stq $1,0($16)
+ .align 4
+$45:
+ ldq $20,0($17) # 4 1
+ ldq $1,0($16) # 4 1
+ mulq $20,$19,$5 # 4 2 1
+ subq $18,1,$18
+ addq $16,8,$16
+ addq $17,8,$17
+ umulh $20,$19,$20 # 4 2
+ addq $1,$5,$1 # 4 2 2
+ cmpult $1,$5,$22 # 4 2 3 1
+ addq $20,$22,$20 # 4 3 1
+ addq $1,$0,$1 # 4 2 3 1
+ cmpult $1,$0,$0 # 4 2 3 2
+ addq $20,$0,$0 # 4 3 2
+ stq $1,-8($16) # 4 2 4
+ bgt $18,$45
+ ret $31,($26),1 # else exit
.align 4
$43:
- ret $31,($26),1
+ addq $18,4,$18
+ bgt $18,$45 # goto tail code
+ ret $31,($26),1 # else exit
+
.end bn_mul_add_words
.align 3
.globl bn_mul_words
@@ -81,49 +113,75 @@ bn_mul_words:
bn_mul_words..ng:
.frame $30,0,$26,0
.prologue 0
- subq $18,2,$25 # num=-2
- bis $31,$31,$0
- blt $25,$242
.align 5
-$342:
- subq $18,2,$18 # num-=2
- subq $25,2,$25 # num-=2
-
- ldq $1,0($17) # a[0]
- ldq $2,8($17) # a[1]
-
- mulq $19,$1,$3 # a[0]*w low part r3
- umulh $19,$1,$1 # a[0]*w high part r1
- mulq $19,$2,$4 # a[1]*w low part r4
- umulh $19,$2,$2 # a[1]*w high part r2
-
- addq $3,$0,$3 # add c
- cmpult $3,$0,$5 # overflow?
- stq $3,0($16)
- addq $5,$1,$0 # c=high part + overflow
-
- addq $4,$0,$4 # add c
- cmpult $4,$0,$5 # overflow?
- stq $4,8($16)
- addq $5,$2,$0 # c=high part + overflow
-
- ble $18,$243
-
- addq $16,16,$16
- addq $17,16,$17
- blt $25,$242
-
- br $31,$342
-$242:
- ldq $1,0($17) # a[0]
- umulh $19,$1,$3 # a[0]*w high part
- mulq $19,$1,$1 # a[0]*w low part
- addq $1,$0,$1 # add c
- cmpult $1,$0,$4 # overflow?
- addq $4,$3,$0 # c=high part + overflow
- stq $1,0($16)
-$243:
- ret $31,($26),1
+ subq $18,4,$18
+ bis $31,$31,$0
+ blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ .align 3
+$142:
+
+ mulq $20,$19,$5 # 1 2 1 #####
+ ldq $21,8($17) # 2 1
+ ldq $27,16($17) # 3 1
+ umulh $20,$19,$20 # 1 2 #####
+ ldq $28,24($17) # 4 1
+ mulq $21,$19,$6 # 2 2 1 #####
+ addq $5,$0,$5 # 1 2 3 1
+ subq $18,4,$18
+ cmpult $5,$0,$0 # 1 2 3 2
+ umulh $21,$19,$21 # 2 2 #####
+ addq $20,$0,$0 # 1 3 2
+ addq $17,32,$17
+ addq $6,$0,$6 # 2 2 3 1
+ mulq $27,$19,$7 # 3 2 1 #####
+ cmpult $6,$0,$0 # 2 2 3 2
+ addq $21,$0,$0 # 2 3 2
+ addq $16,32,$16
+ umulh $27,$19,$27 # 3 2 #####
+ stq $5,-32($16) # 1 2 4
+ mulq $28,$19,$8 # 4 2 1 #####
+ addq $7,$0,$7 # 3 2 3 1
+ stq $6,-24($16) # 2 2 4
+ cmpult $7,$0,$0 # 3 2 3 2
+ umulh $28,$19,$28 # 4 2 #####
+ addq $27,$0,$0 # 3 3 2
+ stq $7,-16($16) # 3 2 4
+ addq $8,$0,$8 # 4 2 3 1
+ cmpult $8,$0,$0 # 4 2 3 2
+
+ addq $28,$0,$0 # 4 3 2
+
+ stq $8,-8($16) # 4 2 4
+
+ blt $18,$143
+
+ ldq $20,0($17) # 1 1
+
+ br $142
+
+ .align 4
+$145:
+ ldq $20,0($17) # 4 1
+ mulq $20,$19,$5 # 4 2 1
+ subq $18,1,$18
+ umulh $20,$19,$20 # 4 2
+ addq $5,$0,$5 # 4 2 3 1
+ addq $16,8,$16
+ cmpult $5,$0,$0 # 4 2 3 2
+ addq $17,8,$17
+ addq $20,$0,$0 # 4 3 2
+ stq $5,-8($16) # 4 2 4
+
+ bgt $18,$145
+ ret $31,($26),1 # else exit
+
+ .align 4
+$143:
+ addq $18,4,$18
+ bgt $18,$145 # goto tail code
+ ret $31,($26),1 # else exit
+
.end bn_mul_words
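
bn_mul_words is the same step without the accumulate from r[i], and it shows the control pattern shared by all the re-done loops: subtract 4 from the count up front, run the 4-way unrolled body while the count stays non-negative, then add the 4 back and finish the remaining 0..3 words in the tail code ($145 above). A rough C rendering under the same hedged assumptions (64-bit BN_ULONG, __int128 for umulh, invented helper names):

    typedef unsigned long BN_ULONG;

    /* one word of r[i] = a[i]*w + c (no accumulate from r[i]) */
    static BN_ULONG mul_step(BN_ULONG *r, BN_ULONG a, BN_ULONG w, BN_ULONG c)
    {
        BN_ULONG lo = a * w;
        BN_ULONG hi = (BN_ULONG)(((unsigned __int128)a * w) >> 64);
        lo += c;
        hi += (lo < c);                     /* cmpult carry pickup */
        *r = lo;
        return hi;
    }

    BN_ULONG bn_mul_words_sketch(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG w)
    {
        BN_ULONG c = 0;
        int i = 0;

        for (n -= 4; n >= 0; n -= 4, i += 4) {      /* the $142 body */
            c = mul_step(&r[i + 0], a[i + 0], w, c);
            c = mul_step(&r[i + 1], a[i + 1], w, c);
            c = mul_step(&r[i + 2], a[i + 2], w, c);
            c = mul_step(&r[i + 3], a[i + 3], w, c);
        }
        for (n += 4; n > 0; n--, i++)               /* the $145 tail */
            c = mul_step(&r[i], a[i], w, c);
        return c;
    }
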
.align 3
.globl bn_sqr_words
@@ -132,44 +190,58 @@ bn_sqr_words:
bn_sqr_words..ng:
.frame $30,0,$26,0
.prologue 0
-
- subq $18,2,$25 # num=-2
- blt $25,$442
- .align 5
-$542:
- subq $18,2,$18 # num-=2
- subq $25,2,$25 # num-=2
-
- ldq $1,0($17) # a[0]
- ldq $4,8($17) # a[1]
- mulq $1,$1,$2 # a[0]*w low part r2
- umulh $1,$1,$3 # a[0]*w high part r3
- mulq $4,$4,$5 # a[1]*w low part r5
- umulh $4,$4,$6 # a[1]*w high part r6
-
- stq $2,0($16) # r[0]
- stq $3,8($16) # r[1]
- stq $5,16($16) # r[3]
- stq $6,24($16) # r[4]
+ subq $18,4,$18
+ blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ .align 3
+$542:
+ mulq $20,$20,$5 ######
+ ldq $21,8($17) # 1 1
+ subq $18,4,$18
+ umulh $20,$20,$1 ######
+ ldq $27,16($17) # 1 1
+ mulq $21,$21,$6 ######
+ ldq $28,24($17) # 1 1
+ stq $5,0($16) # r[0]
+ umulh $21,$21,$2 ######
+ stq $1,8($16) # r[1]
+ mulq $27,$27,$7 ######
+ stq $6,16($16) # r[0]
+ umulh $27,$27,$3 ######
+ stq $2,24($16) # r[1]
+ mulq $28,$28,$8 ######
+ stq $7,32($16) # r[0]
+ umulh $28,$28,$4 ######
+ stq $3,40($16) # r[1]
- ble $18,$443
+ addq $16,64,$16
+ addq $17,32,$17
+ stq $8,-16($16) # r[0]
+ stq $4,-8($16) # r[1]
- addq $16,32,$16
- addq $17,16,$17
- blt $25,$442
- br $31,$542
+ blt $18,$543
+ ldq $20,0($17) # 1 1
+ br $542
$442:
- ldq $1,0($17) # a[0]
- mulq $1,$1,$2 # a[0]*w low part r2
- umulh $1,$1,$3 # a[0]*w high part r3
- stq $2,0($16) # r[0]
- stq $3,8($16) # r[1]
+ ldq $20,0($17) # a[0]
+ mulq $20,$20,$5 # a[0]*w low part r2
+ addq $16,16,$16
+ addq $17,8,$17
+ subq $18,1,$18
+ umulh $20,$20,$1 # a[0]*w high part r3
+ stq $5,-16($16) # r[0]
+ stq $1,-8($16) # r[1]
+
+ bgt $18,$442
+ ret $31,($26),1 # else exit
.align 4
-$443:
- ret $31,($26),1
+$543:
+ addq $18,4,$18
+ bgt $18,$442 # goto tail code
+ ret $31,($26),1 # else exit
.end bn_sqr_words
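
bn_sqr_words has no carry chain at all: each input word independently produces two output words, which is why the loop above is nothing but paired mulq/umulh and stores. In sketch form (same assumptions as the earlier C fragments):

    typedef unsigned long BN_ULONG;

    /* r[2i] and r[2i+1] receive the low and high halves of a[i]^2 */
    void bn_sqr_words_sketch(BN_ULONG *r, const BN_ULONG *a, int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            unsigned __int128 p = (unsigned __int128)a[i] * a[i];
            r[2 * i]     = (BN_ULONG)p;             /* mulq  */
            r[2 * i + 1] = (BN_ULONG)(p >> 64);     /* umulh */
        }
    }
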
.align 3
@@ -180,31 +252,74 @@ bn_add_words..ng:
.frame $30,0,$26,0
.prologue 0
- bis $31,$31,$8 # carry = 0
- ble $19,$900
+ subq $19,4,$19
+ bis $31,$31,$0 # carry = 0
+ blt $19,$900
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ .align 3
$901:
- ldq $0,0($17) # a[0]
- ldq $1,0($18) # a[1]
+ addq $1,$5,$1 # r=a+b;
+ ldq $6,8($17) # a[1]
+ cmpult $1,$5,$22 # did we overflow?
+ ldq $2,8($18) # b[1]
+ addq $1,$0,$1 # c+= overflow
+ ldq $7,16($17) # a[2]
+ cmpult $1,$0,$0 # overflow?
+ ldq $3,16($18) # b[2]
+ addq $0,$22,$0
+ ldq $8,24($17) # a[3]
+ addq $2,$6,$2 # r=a+b;
+ ldq $4,24($18) # b[3]
+ cmpult $2,$6,$23 # did we overflow?
+ addq $3,$7,$3 # r=a+b;
+ addq $2,$0,$2 # c+= overflow
+ cmpult $3,$7,$24 # did we overflow?
+ cmpult $2,$0,$0 # overflow?
+ addq $4,$8,$4 # r=a+b;
+ addq $0,$23,$0
+ cmpult $4,$8,$25 # did we overflow?
+ addq $3,$0,$3 # c+= overflow
+ stq $1,0($16) # r[0]=c
+ cmpult $3,$0,$0 # overflow?
+ stq $2,8($16) # r[1]=c
+ addq $0,$24,$0
+ stq $3,16($16) # r[2]=c
+ addq $4,$0,$4 # c+= overflow
+ subq $19,4,$19 # loop--
+ cmpult $4,$0,$0 # overflow?
+ addq $17,32,$17 # a++
+ addq $0,$25,$0
+ stq $4,24($16) # r[3]=c
+ addq $18,32,$18 # b++
+ addq $16,32,$16 # r++
- addq $0,$1,$3 # c=a+b;
+ blt $19,$900
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ br $901
+ .align 4
+$945:
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ addq $1,$5,$1 # r=a+b;
+ subq $19,1,$19 # loop--
+ addq $1,$0,$1 # c+= overflow
addq $17,8,$17 # a++
+ cmpult $1,$5,$22 # did we overflow?
+ cmpult $1,$0,$0 # overflow?
+ addq $18,8,$18 # b++
+ stq $1,0($16) # r[0]=c
+ addq $0,$22,$0
+ addq $16,8,$16 # r++
- cmpult $3,$1,$7 # did we overflow?
- addq $18,8,$18 # b++
-
- addq $8,$3,$3 # c+=carry
+ bgt $19,$945
+ ret $31,($26),1 # else exit
- cmpult $3,$8,$8 # did we overflow?
- stq $3,($16) # r[0]=c
-
- addq $7,$8,$8 # add into overflow
- subq $19,1,$19 # loop--
-
- addq $16,8,$16 # r++
- bgt $19,$901
$900:
- bis $8,$8,$0 # return carry
- ret $31,($26),1
+ addq $19,4,$19
+ bgt $19,$945 # goto tail code
+ ret $31,($26),1 # else exit
.end bn_add_words
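
bn_add_words propagates its carry with two cmpult tests per word: one for the a+b addition and one for adding the incoming carry. At most one of the two can wrap for any given word (a wrapped sum is at most 2^64-2, so adding one more cannot wrap again), which keeps the outgoing carry at 0 or 1. A hedged C model of one step (illustrative name, 64-bit BN_ULONG assumed):

    typedef unsigned long BN_ULONG;

    static BN_ULONG add_step(BN_ULONG a, BN_ULONG b, BN_ULONG *carry)
    {
        BN_ULONG t  = a + b;
        BN_ULONG c1 = (t < b);          /* cmpult: did a+b wrap?    */
        t += *carry;
        BN_ULONG c2 = (t < *carry);     /* cmpult: did +carry wrap? */
        *carry = c1 + c2;               /* at most one of them is 1 */
        return t;
    }
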
#
@@ -339,6 +454,1445 @@ $136:
addq $30,48,$30
ret $31,($26),1
.end bn_div64
- .ident "GCC: (GNU) 2.7.2.1"
+ .set noat
+ .text
+ .align 3
+ .globl bn_sub_words
+ .ent bn_sub_words
+bn_sub_words:
+bn_sub_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19, 4, $19
+ bis $31, $31, $0
+ blt $19, $100
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+$101:
+ ldq $3, 8($17)
+ cmpult $1, $2, $4
+ ldq $5, 8($18)
+ subq $1, $2, $1
+ ldq $6, 16($17)
+ cmpult $1, $0, $2
+ ldq $7, 16($18)
+ subq $1, $0, $23
+ ldq $8, 24($17)
+ addq $2, $4, $0
+ cmpult $3, $5, $24
+ subq $3, $5, $3
+ ldq $22, 24($18)
+ cmpult $3, $0, $5
+ subq $3, $0, $25
+ addq $5, $24, $0
+ cmpult $6, $7, $27
+ subq $6, $7, $6
+ stq $23, 0($16)
+ cmpult $6, $0, $7
+ subq $6, $0, $28
+ addq $7, $27, $0
+ cmpult $8, $22, $21
+ subq $8, $22, $8
+ stq $25, 8($16)
+ cmpult $8, $0, $22
+ subq $8, $0, $20
+ addq $22, $21, $0
+ stq $28, 16($16)
+ subq $19, 4, $19
+ stq $20, 24($16)
+ addq $17, 32, $17
+ addq $18, 32, $18
+ addq $16, 32, $16
+ blt $19, $100
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ br $101
+$102:
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ cmpult $1, $2, $27
+ subq $1, $2, $1
+ cmpult $1, $0, $2
+ subq $1, $0, $1
+ stq $1, 0($16)
+ addq $2, $27, $0
+ addq $17, 8, $17
+ addq $18, 8, $18
+ addq $16, 8, $16
+ subq $19, 1, $19
+ bgt $19, $102
+ ret $31,($26),1
+$100:
+ addq $19, 4, $19
+ bgt $19, $102
+$103:
+ ret $31,($26),1
+ .end bn_sub_words
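
bn_sub_words is the mirror image: cmpult before the subq detects the borrow out of a-b, and a second test catches the borrow from subtracting the incoming borrow; again at most one of the two can fire per word. Sketched in C under the same assumptions:

    typedef unsigned long BN_ULONG;

    static BN_ULONG sub_step(BN_ULONG a, BN_ULONG b, BN_ULONG *borrow)
    {
        BN_ULONG b1 = (a < b);          /* cmpult before the subq  */
        BN_ULONG t  = a - b;
        BN_ULONG b2 = (t < *borrow);    /* borrow from the -borrow */
        t -= *borrow;
        *borrow = b1 + b2;              /* at most one of them is 1 */
        return t;
    }
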
+ .text
+ .align 3
+ .globl bn_mul_comba4
+ .ent bn_mul_comba4
+bn_mul_comba4:
+bn_mul_comba4..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ ldq $4, 16($17)
+ ldq $5, 16($18)
+ ldq $6, 24($17)
+ ldq $7, 24($18)
+ bis $31, $31, $23
+ mulq $0, $1, $8
+ umulh $0, $1, $22
+ stq $8, 0($16)
+ bis $31, $31, $8
+ mulq $0, $3, $24
+ umulh $0, $3, $25
+ addq $22, $24, $22
+ cmpult $22, $24, $27
+ addq $27, $25, $25
+ addq $23, $25, $23
+ cmpult $23, $25, $28
+ addq $8, $28, $8
+ mulq $2, $1, $21
+ umulh $2, $1, $20
+ addq $22, $21, $22
+ cmpult $22, $21, $19
+ addq $19, $20, $20
+ addq $23, $20, $23
+ cmpult $23, $20, $17
+ addq $8, $17, $8
+ stq $22, 8($16)
+ bis $31, $31, $22
+ mulq $2, $3, $18
+ umulh $2, $3, $24
+ addq $23, $18, $23
+ cmpult $23, $18, $27
+ addq $27, $24, $24
+ addq $8, $24, $8
+ cmpult $8, $24, $25
+ addq $22, $25, $22
+ mulq $0, $5, $28
+ umulh $0, $5, $21
+ addq $23, $28, $23
+ cmpult $23, $28, $19
+ addq $19, $21, $21
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $22, $20, $22
+ mulq $4, $1, $17
+ umulh $4, $1, $18
+ addq $23, $17, $23
+ cmpult $23, $17, $27
+ addq $27, $18, $18
+ addq $8, $18, $8
+ cmpult $8, $18, $24
+ addq $22, $24, $22
+ stq $23, 16($16)
+ bis $31, $31, $23
+ mulq $0, $7, $25
+ umulh $0, $7, $28
+ addq $8, $25, $8
+ cmpult $8, $25, $19
+ addq $19, $28, $28
+ addq $22, $28, $22
+ cmpult $22, $28, $21
+ addq $23, $21, $23
+ mulq $2, $5, $20
+ umulh $2, $5, $17
+ addq $8, $20, $8
+ cmpult $8, $20, $27
+ addq $27, $17, $17
+ addq $22, $17, $22
+ cmpult $22, $17, $18
+ addq $23, $18, $23
+ mulq $4, $3, $24
+ umulh $4, $3, $25
+ addq $8, $24, $8
+ cmpult $8, $24, $19
+ addq $19, $25, $25
+ addq $22, $25, $22
+ cmpult $22, $25, $28
+ addq $23, $28, $23
+ mulq $6, $1, $21
+ umulh $6, $1, $0
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $20, $0, $0
+ addq $22, $0, $22
+ cmpult $22, $0, $27
+ addq $23, $27, $23
+ stq $8, 24($16)
+ bis $31, $31, $8
+ mulq $2, $7, $17
+ umulh $2, $7, $18
+ addq $22, $17, $22
+ cmpult $22, $17, $24
+ addq $24, $18, $18
+ addq $23, $18, $23
+ cmpult $23, $18, $19
+ addq $8, $19, $8
+ mulq $4, $5, $25
+ umulh $4, $5, $28
+ addq $22, $25, $22
+ cmpult $22, $25, $21
+ addq $21, $28, $28
+ addq $23, $28, $23
+ cmpult $23, $28, $20
+ addq $8, $20, $8
+ mulq $6, $3, $0
+ umulh $6, $3, $27
+ addq $22, $0, $22
+ cmpult $22, $0, $1
+ addq $1, $27, $27
+ addq $23, $27, $23
+ cmpult $23, $27, $17
+ addq $8, $17, $8
+ stq $22, 32($16)
+ bis $31, $31, $22
+ mulq $4, $7, $24
+ umulh $4, $7, $18
+ addq $23, $24, $23
+ cmpult $23, $24, $19
+ addq $19, $18, $18
+ addq $8, $18, $8
+ cmpult $8, $18, $2
+ addq $22, $2, $22
+ mulq $6, $5, $25
+ umulh $6, $5, $21
+ addq $23, $25, $23
+ cmpult $23, $25, $28
+ addq $28, $21, $21
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $22, $20, $22
+ stq $23, 40($16)
+ bis $31, $31, $23
+ mulq $6, $7, $0
+ umulh $6, $7, $1
+ addq $8, $0, $8
+ cmpult $8, $0, $27
+ addq $27, $1, $1
+ addq $22, $1, $22
+ cmpult $22, $1, $17
+ addq $23, $17, $23
+ stq $8, 48($16)
+ stq $22, 56($16)
+ ret $31,($26),1
+ .end bn_mul_comba4
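
bn_mul_comba4 above is a fully unrolled Comba multiply: every product a[i]*b[j] is added into a three-word column accumulator for output word i+j, each stq retires one finished column, and the bis $31,$31 clears the new top word as the accumulator registers rotate. The repeated mulq/umulh/addq/cmpult cluster is one such step; roughly, in C (same hedged BN_ULONG/__int128 assumptions, invented helper name):

    typedef unsigned long BN_ULONG;

    /* add a*b into the column accumulator c2:c1:c0 */
    static void mul_add_c(BN_ULONG a, BN_ULONG b,
                          BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
    {
        BN_ULONG lo = a * b;
        BN_ULONG hi = (BN_ULONG)(((unsigned __int128)a * b) >> 64);

        *c0 += lo;
        hi  += (*c0 < lo);              /* carry out of the low word */
        *c1 += hi;
        *c2 += (*c1 < hi);              /* carry into the top word   */
    }

Column k of the 4x4 case is then mul_add_c for every pair with i+j == k, followed by r[k] = c0 and the rotation c0 = c1, c1 = c2, c2 = 0.
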
+ .text
+ .align 3
+ .globl bn_mul_comba8
+ .ent bn_mul_comba8
+bn_mul_comba8:
+bn_mul_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $30, 16, $30
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ stq $9, 0($30)
+ stq $10, 8($30)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ ldq $4, 16($17)
+ ldq $5, 16($18)
+ ldq $6, 24($17)
+ ldq $7, 24($18)
+ ldq $8, 32($17) # a[4]
+ ldq $22, 32($18) # b[4]
+ ldq $23, 40($17) # a[5]
+ ldq $24, 40($18) # b[5]
+ ldq $25, 48($17) # a[6]
+ ldq $27, 48($18) # b[6]
+ ldq $28, 56($17) # a[7]
+ ldq $21, 56($18) # b[7]
+ bis $31, $31, $9
+ mulq $0, $1, $20
+ umulh $0, $1, $19
+ stq $20, 0($16)
+ bis $31, $31, $20
+ mulq $0, $3, $10
+ umulh $0, $3, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $2, $1, $18
+ umulh $2, $1, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ stq $19, 8($16)
+ bis $31, $31, $19
+ mulq $0, $5, $10
+ umulh $0, $5, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $2, $3, $18
+ umulh $2, $3, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $4, $1, $10
+ umulh $4, $1, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ stq $9, 16($16)
+ bis $31, $31, $9
+ mulq $0, $7, $18
+ umulh $0, $7, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $2, $5, $10
+ umulh $2, $5, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $4, $3, $18
+ umulh $4, $3, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $6, $1, $10
+ umulh $6, $1, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ stq $20, 24($16)
+ bis $31, $31, $20
+ mulq $0, $22, $18
+ umulh $0, $22, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $2, $7, $10
+ umulh $2, $7, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $4, $5, $18
+ umulh $4, $5, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $6, $3, $10
+ umulh $6, $3, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $8, $1, $18
+ umulh $8, $1, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ stq $19, 32($16)
+ bis $31, $31, $19
+ mulq $0, $24, $10
+ umulh $0, $24, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $2, $22, $18
+ umulh $2, $22, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $4, $7, $10
+ umulh $4, $7, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $6, $5, $18
+ umulh $6, $5, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $8, $3, $10
+ umulh $8, $3, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $23, $1, $18
+ umulh $23, $1, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ stq $9, 40($16)
+ bis $31, $31, $9
+ mulq $0, $27, $10
+ umulh $0, $27, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $2, $24, $18
+ umulh $2, $24, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $4, $22, $10
+ umulh $4, $22, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $6, $7, $18
+ umulh $6, $7, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $8, $5, $10
+ umulh $8, $5, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $23, $3, $18
+ umulh $23, $3, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $25, $1, $10
+ umulh $25, $1, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ stq $20, 48($16)
+ bis $31, $31, $20
+ mulq $0, $21, $18
+ umulh $0, $21, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $2, $27, $10
+ umulh $2, $27, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $4, $24, $10
+ umulh $4, $24, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $6, $22, $10
+ umulh $6, $22, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $8, $7, $10
+ umulh $8, $7, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $23, $5, $10
+ umulh $23, $5, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $25, $3, $10
+ umulh $25, $3, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $28, $1, $10
+ umulh $28, $1, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ stq $19, 56($16)
+ bis $31, $31, $19
+ mulq $2, $21, $10
+ umulh $2, $21, $18
+ addq $9, $10, $9
+ cmpult $9, $10, $17
+ addq $17, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $0
+ addq $19, $0, $19
+ mulq $4, $27, $1
+ umulh $4, $27, $10
+ addq $9, $1, $9
+ cmpult $9, $1, $17
+ addq $17, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $19, $18, $19
+ mulq $6, $24, $0
+ umulh $6, $24, $2
+ addq $9, $0, $9
+ cmpult $9, $0, $1
+ addq $1, $2, $2
+ addq $20, $2, $20
+ cmpult $20, $2, $17
+ addq $19, $17, $19
+ mulq $8, $22, $10
+ umulh $8, $22, $18
+ addq $9, $10, $9
+ cmpult $9, $10, $0
+ addq $0, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $1
+ addq $19, $1, $19
+ mulq $23, $7, $2
+ umulh $23, $7, $17
+ addq $9, $2, $9
+ cmpult $9, $2, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $0
+ addq $19, $0, $19
+ mulq $25, $5, $18
+ umulh $25, $5, $1
+ addq $9, $18, $9
+ cmpult $9, $18, $2
+ addq $2, $1, $1
+ addq $20, $1, $20
+ cmpult $20, $1, $10
+ addq $19, $10, $19
+ mulq $28, $3, $17
+ umulh $28, $3, $0
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $18, $0, $0
+ addq $20, $0, $20
+ cmpult $20, $0, $2
+ addq $19, $2, $19
+ stq $9, 64($16)
+ bis $31, $31, $9
+ mulq $4, $21, $1
+ umulh $4, $21, $10
+ addq $20, $1, $20
+ cmpult $20, $1, $17
+ addq $17, $10, $10
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $9, $18, $9
+ mulq $6, $27, $0
+ umulh $6, $27, $2
+ addq $20, $0, $20
+ cmpult $20, $0, $3
+ addq $3, $2, $2
+ addq $19, $2, $19
+ cmpult $19, $2, $1
+ addq $9, $1, $9
+ mulq $8, $24, $17
+ umulh $8, $24, $10
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $18, $10, $10
+ addq $19, $10, $19
+ cmpult $19, $10, $4
+ addq $9, $4, $9
+ mulq $23, $22, $0
+ umulh $23, $22, $3
+ addq $20, $0, $20
+ cmpult $20, $0, $2
+ addq $2, $3, $3
+ addq $19, $3, $19
+ cmpult $19, $3, $1
+ addq $9, $1, $9
+ mulq $25, $7, $17
+ umulh $25, $7, $18
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $10, $18, $18
+ addq $19, $18, $19
+ cmpult $19, $18, $4
+ addq $9, $4, $9
+ mulq $28, $5, $0
+ umulh $28, $5, $2
+ addq $20, $0, $20
+ cmpult $20, $0, $3
+ addq $3, $2, $2
+ addq $19, $2, $19
+ cmpult $19, $2, $1
+ addq $9, $1, $9
+ stq $20, 72($16)
+ bis $31, $31, $20
+ mulq $6, $21, $17
+ umulh $6, $21, $10
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $18, $10, $10
+ addq $9, $10, $9
+ cmpult $9, $10, $4
+ addq $20, $4, $20
+ mulq $8, $27, $0
+ umulh $8, $27, $3
+ addq $19, $0, $19
+ cmpult $19, $0, $2
+ addq $2, $3, $3
+ addq $9, $3, $9
+ cmpult $9, $3, $1
+ addq $20, $1, $20
+ mulq $23, $24, $5
+ umulh $23, $24, $17
+ addq $19, $5, $19
+ cmpult $19, $5, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $25, $22, $4
+ umulh $25, $22, $6
+ addq $19, $4, $19
+ cmpult $19, $4, $0
+ addq $0, $6, $6
+ addq $9, $6, $9
+ cmpult $9, $6, $2
+ addq $20, $2, $20
+ mulq $28, $7, $3
+ umulh $28, $7, $1
+ addq $19, $3, $19
+ cmpult $19, $3, $5
+ addq $5, $1, $1
+ addq $9, $1, $9
+ cmpult $9, $1, $18
+ addq $20, $18, $20
+ stq $19, 80($16)
+ bis $31, $31, $19
+ mulq $8, $21, $17
+ umulh $8, $21, $10
+ addq $9, $17, $9
+ cmpult $9, $17, $4
+ addq $4, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $0
+ addq $19, $0, $19
+ mulq $23, $27, $6
+ umulh $23, $27, $2
+ addq $9, $6, $9
+ cmpult $9, $6, $3
+ addq $3, $2, $2
+ addq $20, $2, $20
+ cmpult $20, $2, $5
+ addq $19, $5, $19
+ mulq $25, $24, $1
+ umulh $25, $24, $18
+ addq $9, $1, $9
+ cmpult $9, $1, $7
+ addq $7, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $17
+ addq $19, $17, $19
+ mulq $28, $22, $4
+ umulh $28, $22, $10
+ addq $9, $4, $9
+ cmpult $9, $4, $0
+ addq $0, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $8
+ addq $19, $8, $19
+ stq $9, 88($16)
+ bis $31, $31, $9
+ mulq $23, $21, $6
+ umulh $23, $21, $3
+ addq $20, $6, $20
+ cmpult $20, $6, $2
+ addq $2, $3, $3
+ addq $19, $3, $19
+ cmpult $19, $3, $5
+ addq $9, $5, $9
+ mulq $25, $27, $1
+ umulh $25, $27, $7
+ addq $20, $1, $20
+ cmpult $20, $1, $18
+ addq $18, $7, $7
+ addq $19, $7, $19
+ cmpult $19, $7, $17
+ addq $9, $17, $9
+ mulq $28, $24, $4
+ umulh $28, $24, $0
+ addq $20, $4, $20
+ cmpult $20, $4, $10
+ addq $10, $0, $0
+ addq $19, $0, $19
+ cmpult $19, $0, $8
+ addq $9, $8, $9
+ stq $20, 96($16)
+ bis $31, $31, $20
+ mulq $25, $21, $22
+ umulh $25, $21, $6
+ addq $19, $22, $19
+ cmpult $19, $22, $2
+ addq $2, $6, $6
+ addq $9, $6, $9
+ cmpult $9, $6, $3
+ addq $20, $3, $20
+ mulq $28, $27, $5
+ umulh $28, $27, $23
+ addq $19, $5, $19
+ cmpult $19, $5, $1
+ addq $1, $23, $23
+ addq $9, $23, $9
+ cmpult $9, $23, $18
+ addq $20, $18, $20
+ stq $19, 104($16)
+ bis $31, $31, $19
+ mulq $28, $21, $7
+ umulh $28, $21, $17
+ addq $9, $7, $9
+ cmpult $9, $7, $4
+ addq $4, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ stq $9, 112($16)
+ stq $20, 120($16)
+ ldq $9, 0($30)
+ ldq $10, 8($30)
+ addq $30, 16, $30
+ ret $31,($26),1
+ .end bn_mul_comba8
+ .text
+ .align 3
+ .globl bn_sqr_comba4
+ .ent bn_sqr_comba4
+bn_sqr_comba4:
+bn_sqr_comba4..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 8($17)
+ ldq $2, 16($17)
+ ldq $3, 24($17)
+ bis $31, $31, $6
+ mulq $0, $0, $4
+ umulh $0, $0, $5
+ stq $4, 0($16)
+ bis $31, $31, $4
+ mulq $0, $1, $7
+ umulh $0, $1, $8
+ cmplt $7, $31, $22
+ cmplt $8, $31, $23
+ addq $7, $7, $7
+ addq $8, $8, $8
+ addq $8, $22, $8
+ addq $4, $23, $4
+ addq $5, $7, $5
+ addq $6, $8, $6
+ cmpult $5, $7, $24
+ cmpult $6, $8, $25
+ addq $6, $24, $6
+ addq $4, $25, $4
+ stq $5, 8($16)
+ bis $31, $31, $5
+ mulq $1, $1, $27
+ umulh $1, $1, $28
+ addq $6, $27, $6
+ addq $4, $28, $4
+ cmpult $6, $27, $21
+ cmpult $4, $28, $20
+ addq $4, $21, $4
+ addq $5, $20, $5
+ mulq $2, $0, $19
+ umulh $2, $0, $18
+ cmplt $19, $31, $17
+ cmplt $18, $31, $22
+ addq $19, $19, $19
+ addq $18, $18, $18
+ addq $18, $17, $18
+ addq $5, $22, $5
+ addq $6, $19, $6
+ addq $4, $18, $4
+ cmpult $6, $19, $23
+ cmpult $4, $18, $7
+ addq $4, $23, $4
+ addq $5, $7, $5
+ stq $6, 16($16)
+ bis $31, $31, $6
+ mulq $3, $0, $8
+ umulh $3, $0, $24
+ cmplt $8, $31, $25
+ cmplt $24, $31, $27
+ addq $8, $8, $8
+ addq $24, $24, $24
+ addq $24, $25, $24
+ addq $6, $27, $6
+ addq $4, $8, $4
+ addq $5, $24, $5
+ cmpult $4, $8, $28
+ cmpult $5, $24, $21
+ addq $5, $28, $5
+ addq $6, $21, $6
+ mulq $2, $1, $20
+ umulh $2, $1, $17
+ cmplt $20, $31, $22
+ cmplt $17, $31, $19
+ addq $20, $20, $20
+ addq $17, $17, $17
+ addq $17, $22, $17
+ addq $6, $19, $6
+ addq $4, $20, $4
+ addq $5, $17, $5
+ cmpult $4, $20, $18
+ cmpult $5, $17, $23
+ addq $5, $18, $5
+ addq $6, $23, $6
+ stq $4, 24($16)
+ bis $31, $31, $4
+ mulq $2, $2, $7
+ umulh $2, $2, $25
+ addq $5, $7, $5
+ addq $6, $25, $6
+ cmpult $5, $7, $27
+ cmpult $6, $25, $8
+ addq $6, $27, $6
+ addq $4, $8, $4
+ mulq $3, $1, $24
+ umulh $3, $1, $28
+ cmplt $24, $31, $21
+ cmplt $28, $31, $22
+ addq $24, $24, $24
+ addq $28, $28, $28
+ addq $28, $21, $28
+ addq $4, $22, $4
+ addq $5, $24, $5
+ addq $6, $28, $6
+ cmpult $5, $24, $19
+ cmpult $6, $28, $20
+ addq $6, $19, $6
+ addq $4, $20, $4
+ stq $5, 32($16)
+ bis $31, $31, $5
+ mulq $3, $2, $17
+ umulh $3, $2, $18
+ cmplt $17, $31, $23
+ cmplt $18, $31, $7
+ addq $17, $17, $17
+ addq $18, $18, $18
+ addq $18, $23, $18
+ addq $5, $7, $5
+ addq $6, $17, $6
+ addq $4, $18, $4
+ cmpult $6, $17, $25
+ cmpult $4, $18, $27
+ addq $4, $25, $4
+ addq $5, $27, $5
+ stq $6, 40($16)
+ bis $31, $31, $6
+ mulq $3, $3, $8
+ umulh $3, $3, $21
+ addq $4, $8, $4
+ addq $5, $21, $5
+ cmpult $4, $8, $22
+ cmpult $5, $21, $24
+ addq $5, $22, $5
+ addq $6, $24, $6
+ stq $4, 48($16)
+ stq $5, 56($16)
+ ret $31,($26),1
+ .end bn_sqr_comba4
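
In the squaring combas every cross product a[i]*a[j] with i != j occurs twice, so instead of issuing two multiplies the 128-bit product is doubled in registers. The cmplt x,$31 instructions read the sign bit of each half before the self-add shifts it out ($31 is the zero register, so comparing "less than zero" tests the top bit). A hedged C model of just that doubling:

    typedef unsigned long BN_ULONG;

    /* double hi:lo in place; the returned bit overflows into the
     * next column word up */
    static BN_ULONG double_128(BN_ULONG *lo, BN_ULONG *hi)
    {
        BN_ULONG t_lo = *lo >> 63;      /* cmplt $lo,$31 */
        BN_ULONG t_hi = *hi >> 63;      /* cmplt $hi,$31 */

        *lo += *lo;                     /* addq x,x,x == shift left 1 */
        *hi += *hi;
        *hi += t_lo;                    /* bit moving up from lo      */
        return t_hi;                    /* bit moving out of hi       */
    }

The doubled product is then folded into the column accumulator with the usual cmpult carry pickups.
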
+ .text
+ .align 3
+ .globl bn_sqr_comba8
+ .ent bn_sqr_comba8
+bn_sqr_comba8:
+bn_sqr_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ ldq $0, 0($17)
+ ldq $1, 8($17)
+ ldq $2, 16($17)
+ ldq $3, 24($17)
+ ldq $4, 32($17)
+ ldq $5, 40($17)
+ ldq $6, 48($17)
+ ldq $7, 56($17)
+ bis $31, $31, $23
+ mulq $0, $0, $8
+ umulh $0, $0, $22
+ stq $8, 0($16)
+ bis $31, $31, $8
+ mulq $1, $0, $24
+ umulh $1, $0, $25
+ cmplt $24, $31, $27
+ cmplt $25, $31, $28
+ addq $24, $24, $24
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $8, $28, $8
+ addq $22, $24, $22
+ addq $23, $25, $23
+ cmpult $22, $24, $21
+ cmpult $23, $25, $20
+ addq $23, $21, $23
+ addq $8, $20, $8
+ stq $22, 8($16)
+ bis $31, $31, $22
+ mulq $1, $1, $19
+ umulh $1, $1, $18
+ addq $23, $19, $23
+ addq $8, $18, $8
+ cmpult $23, $19, $17
+ cmpult $8, $18, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ mulq $2, $0, $28
+ umulh $2, $0, $24
+ cmplt $28, $31, $25
+ cmplt $24, $31, $21
+ addq $28, $28, $28
+ addq $24, $24, $24
+ addq $24, $25, $24
+ addq $22, $21, $22
+ addq $23, $28, $23
+ addq $8, $24, $8
+ cmpult $23, $28, $20
+ cmpult $8, $24, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 16($16)
+ bis $31, $31, $23
+ mulq $2, $1, $18
+ umulh $2, $1, $17
+ cmplt $18, $31, $27
+ cmplt $17, $31, $25
+ addq $18, $18, $18
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $23, $25, $23
+ addq $8, $18, $8
+ addq $22, $17, $22
+ cmpult $8, $18, $21
+ cmpult $22, $17, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $3, $0, $24
+ umulh $3, $0, $20
+ cmplt $24, $31, $19
+ cmplt $20, $31, $27
+ addq $24, $24, $24
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $23, $27, $23
+ addq $8, $24, $8
+ addq $22, $20, $22
+ cmpult $8, $24, $25
+ cmpult $22, $20, $18
+ addq $22, $25, $22
+ addq $23, $18, $23
+ stq $8, 24($16)
+ bis $31, $31, $8
+ mulq $2, $2, $17
+ umulh $2, $2, $21
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $28
+ cmpult $23, $21, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ mulq $3, $1, $27
+ umulh $3, $1, $24
+ cmplt $27, $31, $20
+ cmplt $24, $31, $25
+ addq $27, $27, $27
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $8, $25, $8
+ addq $22, $27, $22
+ addq $23, $24, $23
+ cmpult $22, $27, $18
+ cmpult $23, $24, $17
+ addq $23, $18, $23
+ addq $8, $17, $8
+ mulq $4, $0, $21
+ umulh $4, $0, $28
+ cmplt $21, $31, $19
+ cmplt $28, $31, $20
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $19, $28
+ addq $8, $20, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $25
+ cmpult $23, $28, $27
+ addq $23, $25, $23
+ addq $8, $27, $8
+ stq $22, 32($16)
+ bis $31, $31, $22
+ mulq $3, $2, $24
+ umulh $3, $2, $18
+ cmplt $24, $31, $17
+ cmplt $18, $31, $19
+ addq $24, $24, $24
+ addq $18, $18, $18
+ addq $18, $17, $18
+ addq $22, $19, $22
+ addq $23, $24, $23
+ addq $8, $18, $8
+ cmpult $23, $24, $20
+ cmpult $8, $18, $21
+ addq $8, $20, $8
+ addq $22, $21, $22
+ mulq $4, $1, $28
+ umulh $4, $1, $25
+ cmplt $28, $31, $27
+ cmplt $25, $31, $17
+ addq $28, $28, $28
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $22, $17, $22
+ addq $23, $28, $23
+ addq $8, $25, $8
+ cmpult $23, $28, $19
+ cmpult $8, $25, $24
+ addq $8, $19, $8
+ addq $22, $24, $22
+ mulq $5, $0, $18
+ umulh $5, $0, $20
+ cmplt $18, $31, $21
+ cmplt $20, $31, $27
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $21, $20
+ addq $22, $27, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $28
+ addq $8, $17, $8
+ addq $22, $28, $22
+ stq $23, 40($16)
+ bis $31, $31, $23
+ mulq $3, $3, $25
+ umulh $3, $3, $19
+ addq $8, $25, $8
+ addq $22, $19, $22
+ cmpult $8, $25, $24
+ cmpult $22, $19, $21
+ addq $22, $24, $22
+ addq $23, $21, $23
+ mulq $4, $2, $27
+ umulh $4, $2, $18
+ cmplt $27, $31, $20
+ cmplt $18, $31, $17
+ addq $27, $27, $27
+ addq $18, $18, $18
+ addq $18, $20, $18
+ addq $23, $17, $23
+ addq $8, $27, $8
+ addq $22, $18, $22
+ cmpult $8, $27, $28
+ cmpult $22, $18, $25
+ addq $22, $28, $22
+ addq $23, $25, $23
+ mulq $5, $1, $19
+ umulh $5, $1, $24
+ cmplt $19, $31, $21
+ cmplt $24, $31, $20
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $21, $24
+ addq $23, $20, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $17
+ cmpult $22, $24, $27
+ addq $22, $17, $22
+ addq $23, $27, $23
+ mulq $6, $0, $18
+ umulh $6, $0, $28
+ cmplt $18, $31, $25
+ cmplt $28, $31, $21
+ addq $18, $18, $18
+ addq $28, $28, $28
+ addq $28, $25, $28
+ addq $23, $21, $23
+ addq $8, $18, $8
+ addq $22, $28, $22
+ cmpult $8, $18, $20
+ cmpult $22, $28, $19
+ addq $22, $20, $22
+ addq $23, $19, $23
+ stq $8, 48($16)
+ bis $31, $31, $8
+ mulq $4, $3, $24
+ umulh $4, $3, $17
+ cmplt $24, $31, $27
+ cmplt $17, $31, $25
+ addq $24, $24, $24
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $8, $25, $8
+ addq $22, $24, $22
+ addq $23, $17, $23
+ cmpult $22, $24, $21
+ cmpult $23, $17, $18
+ addq $23, $21, $23
+ addq $8, $18, $8
+ mulq $5, $2, $28
+ umulh $5, $2, $20
+ cmplt $28, $31, $19
+ cmplt $20, $31, $27
+ addq $28, $28, $28
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $8, $27, $8
+ addq $22, $28, $22
+ addq $23, $20, $23
+ cmpult $22, $28, $25
+ cmpult $23, $20, $24
+ addq $23, $25, $23
+ addq $8, $24, $8
+ mulq $6, $1, $17
+ umulh $6, $1, $21
+ cmplt $17, $31, $18
+ cmplt $21, $31, $19
+ addq $17, $17, $17
+ addq $21, $21, $21
+ addq $21, $18, $21
+ addq $8, $19, $8
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $27
+ cmpult $23, $21, $28
+ addq $23, $27, $23
+ addq $8, $28, $8
+ mulq $7, $0, $20
+ umulh $7, $0, $25
+ cmplt $20, $31, $24
+ cmplt $25, $31, $18
+ addq $20, $20, $20
+ addq $25, $25, $25
+ addq $25, $24, $25
+ addq $8, $18, $8
+ addq $22, $20, $22
+ addq $23, $25, $23
+ cmpult $22, $20, $19
+ cmpult $23, $25, $17
+ addq $23, $19, $23
+ addq $8, $17, $8
+ stq $22, 56($16)
+ bis $31, $31, $22
+ mulq $4, $4, $21
+ umulh $4, $4, $27
+ addq $23, $21, $23
+ addq $8, $27, $8
+ cmpult $23, $21, $28
+ cmpult $8, $27, $24
+ addq $8, $28, $8
+ addq $22, $24, $22
+ mulq $5, $3, $18
+ umulh $5, $3, $20
+ cmplt $18, $31, $25
+ cmplt $20, $31, $19
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $25, $20
+ addq $22, $19, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $21
+ addq $8, $17, $8
+ addq $22, $21, $22
+ mulq $6, $2, $27
+ umulh $6, $2, $28
+ cmplt $27, $31, $24
+ cmplt $28, $31, $25
+ addq $27, $27, $27
+ addq $28, $28, $28
+ addq $28, $24, $28
+ addq $22, $25, $22
+ addq $23, $27, $23
+ addq $8, $28, $8
+ cmpult $23, $27, $19
+ cmpult $8, $28, $18
+ addq $8, $19, $8
+ addq $22, $18, $22
+ mulq $7, $1, $20
+ umulh $7, $1, $17
+ cmplt $20, $31, $21
+ cmplt $17, $31, $24
+ addq $20, $20, $20
+ addq $17, $17, $17
+ addq $17, $21, $17
+ addq $22, $24, $22
+ addq $23, $20, $23
+ addq $8, $17, $8
+ cmpult $23, $20, $25
+ cmpult $8, $17, $27
+ addq $8, $25, $8
+ addq $22, $27, $22
+ stq $23, 64($16)
+ bis $31, $31, $23
+ mulq $5, $4, $28
+ umulh $5, $4, $19
+ cmplt $28, $31, $18
+ cmplt $19, $31, $21
+ addq $28, $28, $28
+ addq $19, $19, $19
+ addq $19, $18, $19
+ addq $23, $21, $23
+ addq $8, $28, $8
+ addq $22, $19, $22
+ cmpult $8, $28, $24
+ cmpult $22, $19, $20
+ addq $22, $24, $22
+ addq $23, $20, $23
+ mulq $6, $3, $17
+ umulh $6, $3, $25
+ cmplt $17, $31, $27
+ cmplt $25, $31, $18
+ addq $17, $17, $17
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $23, $18, $23
+ addq $8, $17, $8
+ addq $22, $25, $22
+ cmpult $8, $17, $21
+ cmpult $22, $25, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $7, $2, $19
+ umulh $7, $2, $24
+ cmplt $19, $31, $20
+ cmplt $24, $31, $27
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $23, $27, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $18
+ cmpult $22, $24, $17
+ addq $22, $18, $22
+ addq $23, $17, $23
+ stq $8, 72($16)
+ bis $31, $31, $8
+ mulq $5, $5, $25
+ umulh $5, $5, $21
+ addq $22, $25, $22
+ addq $23, $21, $23
+ cmpult $22, $25, $28
+ cmpult $23, $21, $20
+ addq $23, $28, $23
+ addq $8, $20, $8
+ mulq $6, $4, $27
+ umulh $6, $4, $19
+ cmplt $27, $31, $24
+ cmplt $19, $31, $18
+ addq $27, $27, $27
+ addq $19, $19, $19
+ addq $19, $24, $19
+ addq $8, $18, $8
+ addq $22, $27, $22
+ addq $23, $19, $23
+ cmpult $22, $27, $17
+ cmpult $23, $19, $25
+ addq $23, $17, $23
+ addq $8, $25, $8
+ mulq $7, $3, $21
+ umulh $7, $3, $28
+ cmplt $21, $31, $20
+ cmplt $28, $31, $24
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $20, $28
+ addq $8, $24, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $18
+ cmpult $23, $28, $27
+ addq $23, $18, $23
+ addq $8, $27, $8
+ stq $22, 80($16)
+ bis $31, $31, $22
+ mulq $6, $5, $19
+ umulh $6, $5, $17
+ cmplt $19, $31, $25
+ cmplt $17, $31, $20
+ addq $19, $19, $19
+ addq $17, $17, $17
+ addq $17, $25, $17
+ addq $22, $20, $22
+ addq $23, $19, $23
+ addq $8, $17, $8
+ cmpult $23, $19, $24
+ cmpult $8, $17, $21
+ addq $8, $24, $8
+ addq $22, $21, $22
+ mulq $7, $4, $28
+ umulh $7, $4, $18
+ cmplt $28, $31, $27
+ cmplt $18, $31, $25
+ addq $28, $28, $28
+ addq $18, $18, $18
+ addq $18, $27, $18
+ addq $22, $25, $22
+ addq $23, $28, $23
+ addq $8, $18, $8
+ cmpult $23, $28, $20
+ cmpult $8, $18, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 88($16)
+ bis $31, $31, $23
+ mulq $6, $6, $17
+ umulh $6, $6, $24
+ addq $8, $17, $8
+ addq $22, $24, $22
+ cmpult $8, $17, $21
+ cmpult $22, $24, $27
+ addq $22, $21, $22
+ addq $23, $27, $23
+ mulq $7, $5, $25
+ umulh $7, $5, $28
+ cmplt $25, $31, $18
+ cmplt $28, $31, $20
+ addq $25, $25, $25
+ addq $28, $28, $28
+ addq $28, $18, $28
+ addq $23, $20, $23
+ addq $8, $25, $8
+ addq $22, $28, $22
+ cmpult $8, $25, $19
+ cmpult $22, $28, $17
+ addq $22, $19, $22
+ addq $23, $17, $23
+ stq $8, 96($16)
+ bis $31, $31, $8
+ mulq $7, $6, $24
+ umulh $7, $6, $21
+ cmplt $24, $31, $27
+ cmplt $21, $31, $18
+ addq $24, $24, $24
+ addq $21, $21, $21
+ addq $21, $27, $21
+ addq $8, $18, $8
+ addq $22, $24, $22
+ addq $23, $21, $23
+ cmpult $22, $24, $20
+ cmpult $23, $21, $25
+ addq $23, $20, $23
+ addq $8, $25, $8
+ stq $22, 104($16)
+ bis $31, $31, $22
+ mulq $7, $7, $28
+ umulh $7, $7, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ cmpult $23, $28, $17
+ cmpult $8, $19, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ stq $23, 112($16)
+ stq $8, 120($16)
+ ret $31,($26),1
+ .end bn_sqr_comba8
diff --git a/crypto/bn/asm/alpha.s.works b/crypto/bn/asm/alpha.s.works
new file mode 100644
index 0000000000..ee6c587809
--- /dev/null
+++ b/crypto/bn/asm/alpha.s.works
@@ -0,0 +1,533 @@
+
+ # DEC Alpha assembler
+ # The bn_div64 is actually gcc output but the other parts are hand done.
+ # Thanks to tzeruch@ceddec.com for sending me the gcc output for
+ # bn_div64.
+ # I've gone back and re-done most of the routines.
+ # The key thing to remember for the 21164 CPU is that while a
+ # multiply operation takes 8 cycles, another one can only be issued
+ # after 4 cycles have elapsed.  I've made modifications to help
+ # improve this.  Also, normally, the result of a ld instruction
+ # will not be available for about 3 cycles.
+ .file 1 "bn_asm.c"
+ .set noat
+gcc2_compiled.:
+__gnu_compiled_c:
+ .text
+ .align 3
+ .globl bn_mul_add_words
+ .ent bn_mul_add_words
+bn_mul_add_words:
+bn_mul_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$0
+ blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ ldq $1,0($16) # 1 1
+ .align 3
+$42:
+ mulq $20,$19,$5 # 1 2 1 ######
+ ldq $21,8($17) # 2 1
+ ldq $2,8($16) # 2 1
+ umulh $20,$19,$20 # 1 2 ######
+ ldq $27,16($17) # 3 1
+ ldq $3,16($16) # 3 1
+ mulq $21,$19,$6 # 2 2 1 ######
+ ldq $28,24($17) # 4 1
+ addq $1,$5,$1 # 1 2 2
+ ldq $4,24($16) # 4 1
+ umulh $21,$19,$21 # 2 2 ######
+ cmpult $1,$5,$22 # 1 2 3 1
+ addq $20,$22,$20 # 1 3 1
+ addq $1,$0,$1 # 1 2 3 1
+ mulq $27,$19,$7 # 3 2 1 ######
+ cmpult $1,$0,$0 # 1 2 3 2
+ addq $2,$6,$2 # 2 2 2
+ addq $20,$0,$0 # 1 3 2
+ cmpult $2,$6,$23 # 2 2 3 1
+ addq $21,$23,$21 # 2 3 1
+ umulh $27,$19,$27 # 3 2 ######
+ addq $2,$0,$2 # 2 2 3 1
+ cmpult $2,$0,$0 # 2 2 3 2
+ subq $18,4,$18
+ mulq $28,$19,$8 # 4 2 1 ######
+ addq $21,$0,$0 # 2 3 2
+ addq $3,$7,$3 # 3 2 2
+ addq $16,32,$16
+ cmpult $3,$7,$24 # 3 2 3 1
+ stq $1,-32($16) # 1 2 4
+ umulh $28,$19,$28 # 4 2 ######
+ addq $27,$24,$27 # 3 3 1
+ addq $3,$0,$3 # 3 2 3 1
+ stq $2,-24($16) # 2 2 4
+ cmpult $3,$0,$0 # 3 2 3 2
+ stq $3,-16($16) # 3 2 4
+ addq $4,$8,$4 # 4 2 2
+ addq $27,$0,$0 # 3 3 2
+ cmpult $4,$8,$25 # 4 2 3 1
+ addq $17,32,$17
+ addq $28,$25,$28 # 4 3 1
+ addq $4,$0,$4 # 4 2 3 1
+ cmpult $4,$0,$0 # 4 2 3 2
+ stq $4,-8($16) # 4 2 4
+ addq $28,$0,$0 # 4 3 2
+ blt $18,$43
+
+ ldq $20,0($17) # 1 1
+ ldq $1,0($16) # 1 1
+
+ br $42
+
+ .align 4
+$45:
+ ldq $20,0($17) # 4 1
+ ldq $1,0($16) # 4 1
+ mulq $20,$19,$5 # 4 2 1
+ subq $18,1,$18
+ addq $16,8,$16
+ addq $17,8,$17
+ umulh $20,$19,$20 # 4 2
+ addq $1,$5,$1 # 4 2 2
+ cmpult $1,$5,$22 # 4 2 3 1
+ addq $20,$22,$20 # 4 3 1
+ addq $1,$0,$1 # 4 2 3 1
+ cmpult $1,$0,$0 # 4 2 3 2
+ addq $20,$0,$0 # 4 3 2
+ stq $1,-8($16) # 4 2 4
+ bgt $18,$45
+ ret $31,($26),1 # else exit
+
+ .align 4
+$43:
+ addq $18,4,$18
+ bgt $18,$45 # goto tail code
+ ret $31,($26),1 # else exit
+
+ .end bn_mul_add_words
+ .align 3
+ .globl bn_mul_words
+ .ent bn_mul_words
+bn_mul_words:
+bn_mul_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$0
+ blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ .align 3
+$142:
+
+ mulq $20,$19,$5 # 1 2 1 #####
+ ldq $21,8($17) # 2 1
+ ldq $27,16($17) # 3 1
+ umulh $20,$19,$20 # 1 2 #####
+ ldq $28,24($17) # 4 1
+ mulq $21,$19,$6 # 2 2 1 #####
+ addq $5,$0,$5 # 1 2 3 1
+ subq $18,4,$18
+ cmpult $5,$0,$0 # 1 2 3 2
+ umulh $21,$19,$21 # 2 2 #####
+ addq $20,$0,$0 # 1 3 2
+ addq $17,32,$17
+ addq $6,$0,$6 # 2 2 3 1
+ mulq $27,$19,$7 # 3 2 1 #####
+ cmpult $6,$0,$0 # 2 2 3 2
+ addq $21,$0,$0 # 2 3 2
+ addq $16,32,$16
+ umulh $27,$19,$27 # 3 2 #####
+ stq $5,-32($16) # 1 2 4
+ mulq $28,$19,$8 # 4 2 1 #####
+ addq $7,$0,$7 # 3 2 3 1
+ stq $6,-24($16) # 2 2 4
+ cmpult $7,$0,$0 # 3 2 3 2
+ umulh $28,$19,$28 # 4 2 #####
+ addq $27,$0,$0 # 3 3 2
+ stq $7,-16($16) # 3 2 4
+ addq $8,$0,$8 # 4 2 3 1
+ cmpult $8,$0,$0 # 4 2 3 2
+
+ addq $28,$0,$0 # 4 3 2
+
+ stq $8,-8($16) # 4 2 4
+
+ blt $18,$143
+
+ ldq $20,0($17) # 1 1
+
+ br $142
+
+ .align 4
+$145:
+ ldq $20,0($17) # 4 1
+ mulq $20,$19,$5 # 4 2 1
+ subq $18,1,$18
+ umulh $20,$19,$20 # 4 2
+ addq $5,$0,$5 # 4 2 3 1
+ addq $16,8,$16
+ cmpult $5,$0,$0 # 4 2 3 2
+ addq $17,8,$17
+ addq $20,$0,$0 # 4 3 2
+ stq $5,-8($16) # 4 2 4
+
+ bgt $18,$145
+ ret $31,($26),1 # else exit
+
+ .align 4
+$143:
+ addq $18,4,$18
+ bgt $18,$145 # goto tail code
+ ret $31,($26),1 # else exit
+
+ .end bn_mul_words
+ .align 3
+ .globl bn_sqr_words
+ .ent bn_sqr_words
+bn_sqr_words:
+bn_sqr_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $18,4,$18
+ blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $20,0($17) # 1 1
+ .align 3
+$542:
+ mulq $20,$20,$5 ######
+ ldq $21,8($17) # 1 1
+ subq $18,4,$18
+ umulh $20,$20,$1 ######
+ ldq $27,16($17) # 1 1
+ mulq $21,$21,$6 ######
+ ldq $28,24($17) # 1 1
+ stq $5,0($16) # r[0]
+ umulh $21,$21,$2 ######
+ stq $1,8($16) # r[1]
+ mulq $27,$27,$7 ######
+ stq $6,16($16) # r[0]
+ umulh $27,$27,$3 ######
+ stq $2,24($16) # r[1]
+ mulq $28,$28,$8 ######
+ stq $7,32($16) # r[0]
+ umulh $28,$28,$4 ######
+ stq $3,40($16) # r[1]
+
+ addq $16,64,$16
+ addq $17,32,$17
+ stq $8,-16($16) # r[0]
+ stq $4,-8($16) # r[1]
+
+ blt $18,$543
+ ldq $20,0($17) # 1 1
+ br $542
+
+$442:
+ ldq $20,0($17) # a[0]
+ mulq $20,$20,$5 # a[0]*w low part r2
+ addq $16,16,$16
+ addq $17,8,$17
+ subq $18,1,$18
+ umulh $20,$20,$1 # a[0]*w high part r3
+ stq $5,-16($16) # r[0]
+ stq $1,-8($16) # r[1]
+
+ bgt $18,$442
+ ret $31,($26),1 # else exit
+
+ .align 4
+$543:
+ addq $18,4,$18
+ bgt $18,$442 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_sqr_words
+
+ .align 3
+ .globl bn_add_words
+ .ent bn_add_words
+bn_add_words:
+bn_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19,4,$19
+ bis $31,$31,$0 # carry = 0
+ blt $19,$900
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ .align 3
+$901:
+ addq $1,$5,$1 # r=a+b;
+ ldq $6,8($17) # a[1]
+ cmpult $1,$5,$22 # did we overflow?
+ ldq $2,8($18) # b[1]
+ addq $1,$0,$1 # c+= overflow
+ ldq $7,16($17) # a[2]
+ cmpult $1,$0,$0 # overflow?
+ ldq $3,16($18) # b[2]
+ addq $0,$22,$0
+ ldq $8,24($17) # a[3]
+ addq $2,$6,$2 # r=a+b;
+ ldq $4,24($18) # b[3]
+ cmpult $2,$6,$23 # did we overflow?
+ addq $3,$7,$3 # r=a+b;
+ addq $2,$0,$2 # c+= overflow
+ cmpult $3,$7,$24 # did we overflow?
+ cmpult $2,$0,$0 # overflow?
+ addq $4,$8,$4 # r=a+b;
+ addq $0,$23,$0
+ cmpult $4,$8,$25 # did we overflow?
+ addq $3,$0,$3 # c+= overflow
+ stq $1,0($16) # r[0]=c
+ cmpult $3,$0,$0 # overflow?
+ stq $2,8($16) # r[1]=c
+ addq $0,$24,$0
+ stq $3,16($16) # r[2]=c
+ addq $4,$0,$4 # c+= overflow
+ subq $19,4,$19 # loop--
+ cmpult $4,$0,$0 # overflow?
+ addq $17,32,$17 # a++
+ addq $0,$25,$0
+ stq $4,24($16) # r[3]=c
+ addq $18,32,$18 # b++
+ addq $16,32,$16 # r++
+
+ blt $19,$900
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ br $901
+ .align 4
+$945:
+ ldq $5,0($17) # a[0]
+ ldq $1,0($18) # b[1]
+ addq $1,$5,$1 # r=a+b;
+ subq $19,1,$19 # loop--
+ addq $1,$0,$1 # c+= overflow
+ addq $17,8,$17 # a++
+ cmpult $1,$5,$22 # did we overflow?
+ cmpult $1,$0,$0 # overflow?
+ addq $18,8,$18 # b++
+ stq $1,0($16) # r[0]=c
+ addq $0,$22,$0
+ addq $16,8,$16 # r++
+
+ bgt $19,$945
+ ret $31,($26),1 # else exit
+
+$900:
+ addq $19,4,$19
+ bgt $19,$945 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_add_words
+
+ #
+ # What follows was taken directly from the C compiler with a few
+ # hacks to redo the labels.
+ #
+.text
+ .align 3
+ .globl bn_div64
+ .ent bn_div64
+bn_div64:
+ ldgp $29,0($27)
+bn_div64..ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$119
+ lda $0,-1
+ br $31,$136
+ .align 4
+$119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$126
+ zapnot $7,15,$27
+ br $31,$127
+ .align 4
+$126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$127:
+ srl $10,32,$4
+ .align 5
+$128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$129
+ subq $27,1,$27
+ br $31,$128
+ .align 4
+$129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$134
+ addq $9,$11,$9
+ subq $27,1,$27
+$134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$123
+ .align 4
+$124:
+ bis $13,$27,$0
+$136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div64
+
+ .set noat
+ .text
+ .align 3
+ .globl bn_sub_words
+ .ent bn_sub_words
+bn_sub_words:
+bn_sub_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19, 4, $19
+ bis $31, $31, $0
+ blt $19, $100
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+$101:
+ ldq $3, 8($17)
+ cmpult $1, $2, $4
+ ldq $5, 8($18)
+ subq $1, $2, $1
+ ldq $6, 16($17)
+ cmpult $1, $0, $2
+ ldq $7, 16($18)
+ subq $1, $0, $23
+ ldq $8, 24($17)
+ addq $2, $4, $0
+ cmpult $3, $5, $24
+ subq $3, $5, $3
+ ldq $22, 24($18)
+ cmpult $3, $0, $5
+ subq $3, $0, $25
+ addq $5, $24, $0
+ cmpult $6, $7, $27
+ subq $6, $7, $6
+ stq $23, 0($16)
+ cmpult $6, $0, $7
+ subq $6, $0, $28
+ addq $7, $27, $0
+ cmpult $8, $22, $21
+ subq $8, $22, $8
+ stq $25, 8($16)
+ cmpult $8, $0, $22
+ subq $8, $0, $20
+ addq $22, $21, $0
+ stq $28, 16($16)
+ subq $19, 4, $19
+ stq $20, 24($16)
+ addq $17, 32, $17
+ addq $18, 32, $18
+ addq $16, 32, $16
+ blt $19, $100
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ br $101
+$102:
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ cmpult $1, $2, $27
+ subq $1, $2, $1
+ cmpult $1, $0, $2
+ subq $1, $0, $1
+ stq $1, 0($16)
+ addq $2, $27, $0
+ addq $17, 8, $17
+ addq $18, 8, $18
+ addq $16, 8, $16
+ subq $19, 1, $19
+ bgt $19, $102
+ ret $31,($26),1
+$100:
+ addq $19, 4, $19
+ bgt $19, $102
+$103:
+ ret $31,($26),1
+ .end bn_sub_words
diff --git a/crypto/bn/asm/alpha.works/add.pl b/crypto/bn/asm/alpha.works/add.pl
new file mode 100644
index 0000000000..4dc76e6b69
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/add.pl
@@ -0,0 +1,119 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_add_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &br(&label("finish"));
+ &blt($count,&label("finish"));
+
+ ($a0,$b0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &cmpult($o3,$cc,$cc);
+ &add($cc,$t3,$cc); &FR($t3);
+
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &st($o1,&QWPw(0,$rp)); &FR($o1);
+ &st($o2,&QWPw(0,$rp)); &FR($o2);
+ &st($o3,&QWPw(0,$rp)); &FR($o3);
+
+ &sub($count,4,$count); # count-=4
+ &add($ap,4*$QWS,$ap); # ap+=4
+ &add($bp,4*$QWS,$bp); # bp+=4
+ &add($rp,4*$QWS,$rp); # rp+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+##################################################
+ # Do the last 0..3 words
+
+ ($t0,$o0)=&NR(2);
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0); # did we carry?
+ &add($o0,$cc,$o0); # add the carry in
+ &cmpult($o0,$cc,$cc); # did we carry?
+ &add($cc,$t0,$cc); # add the carries
+ &st($o0,&QWPw(0,$rp)); # save
+
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($o0,$t0,$a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/div.pl b/crypto/bn/asm/alpha.works/div.pl
new file mode 100644
index 0000000000..7ec144377f
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/div.pl
@@ -0,0 +1,144 @@
+#!/usr/local/bin/perl
+
+sub bn_div64
+ {
+ local($data)=<<'EOF';
+ #
+ # What follows was taken directly from the C compiler with a few
+ # hacks to redo the labels.
+ #
+.text
+ .set noreorder
+ .set volatile
+ .align 3
+ .globl bn_div64
+ .ent bn_div64
+bn_div64:
+ ldgp $29,0($27)
+bn_div64..ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$9119
+ lda $0,-1
+ br $31,$9136
+ .align 4
+$9119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$9120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$9120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$9120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$9122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$9122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$9123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$9126
+ zapnot $7,15,$27
+ br $31,$9127
+ .align 4
+$9126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$9127:
+ srl $10,32,$4
+ .align 5
+$9128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$9129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$9129
+ subq $27,1,$27
+ br $31,$9128
+ .align 4
+$9129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$9134
+ addq $9,$11,$9
+ subq $27,1,$27
+$9134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$9124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$9123
+ .align 4
+$9124:
+ bis $13,$27,$0
+$9136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div64
+EOF
+ &asm_add($data);
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul.pl b/crypto/bn/asm/alpha.works/mul.pl
new file mode 100644
index 0000000000..b182bae452
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul.pl
@@ -0,0 +1,116 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+	&br(&label("finish"));		# the unrolled loop below is disabled (held in $a)
+	&blt($count,&label("finish"));
+
+ ($a0,$r0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($r0,&QWPw(0,$rp));
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &cmpult($o3,$cc,$cc);
+ &add($cc,$t3,$cc); &FR($t3);
+
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &st($o1,&QWPw(0,$rp)); &FR($o1);
+ &st($o2,&QWPw(0,$rp)); &FR($o2);
+ &st($o3,&QWPw(0,$rp)); &FR($o3);
+
+ &sub($count,4,$count); # count-=4
+ &add($ap,4*$QWS,$ap); # count+=4
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($rp,4*$QWS,$rp); # count+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
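+	# mulq/umulh give the low and high 64 bits of the 128-bit product
+	# a*word; the high half is carried into the next word through $cc.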
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ &mul($a0,$word,($l0)=&NR(1));
+ &add($ap,$QWS,$ap);
+ &muh($a0,$word,($h0)=&NR(1)); &FR($a0);
+ &add($l0,$cc,$l0);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &cmpult($l0,$cc,$cc);
+ &st($l0,&QWPw(-1,$rp)); &FR($l0);
+ &add($h0,$cc,$cc); &FR($h0);
+
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul_add.pl b/crypto/bn/asm/alpha.works/mul_add.pl
new file mode 100644
index 0000000000..e37f6315fb
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_add.pl
@@ -0,0 +1,120 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+	&br(&label("finish"));		# the unrolled loop below is disabled (held in $a)
+	&blt($count,&label("finish"));
+
+ ($a0,$r0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($r0,&QWPw(0,$rp));
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &cmpult($o3,$cc,$cc);
+ &add($cc,$t3,$cc); &FR($t3);
+
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &st($o1,&QWPw(0,$rp)); &FR($o1);
+ &st($o2,&QWPw(0,$rp)); &FR($o2);
+ &st($o3,&QWPw(0,$rp)); &FR($o3);
+
+ &sub($count,4,$count); # count-=4
+ &add($ap,4*$QWS,$ap); # count+=4
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($rp,4*$QWS,$rp); # count+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
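+	# Each word computes r = r + a*word + cc.  Adding the low product
+	# can overflow r (first cmpult), and adding cc can overflow it
+	# again (second cmpult); both carry bits are folded into the high
+	# product, which becomes the next cc.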
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+	&ld(($r0)=&NR(1),&QWPw(0,$rp));	# get r
+ &mul($a0,$word,($l0)=&NR(1));
+ &sub($count,1,$count);
+ &add($ap,$QWS,$ap);
+ &muh($a0,$word,($h0)=&NR(1)); &FR($a0);
+ &add($r0,$l0,$r0);
+ &add($rp,$QWS,$rp);
+ &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0);
+ &add($r0,$cc,$r0);
+ &add($h0,$t0,$h0); &FR($t0);
+ &cmpult($r0,$cc,$cc);
+ &st($r0,&QWPw(-1,$rp)); &FR($r0);
+ &add($h0,$cc,$cc); &FR($h0);
+
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul_c4.pl b/crypto/bn/asm/alpha.works/mul_c4.pl
new file mode 100644
index 0000000000..5efd201281
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_c4.pl
@@ -0,0 +1,213 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
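+# mul_add_c above accumulates a*b into the three-word running total
+# (c0,c1,c2): c0 takes the low half of the product, c1 the high half
+# plus any carry out of c0, and c2 any carry out of c1.  Comba
+# multiplication builds each output word from such partial products.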
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &mul($a[0],$b[0],($r00)=&NR(1));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &muh($a[0],$b[0],($r01)=&NR(1));
+ &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+ &mul($a[0],$b[1],($r02)=&NR(1));
+
+ ($R,$H1,$H2)=&NR(3);
+
+ &st($r00,&QWPw(0,$rp)); &FR($r00);
+
+ &mov("zero",$R);
+ &mul($a[1],$b[0],($r03)=&NR(1));
+
+ &mov("zero",$H1);
+	&mov("zero",$H2);
+	&add($R,$r01,$R);
+	&muh($a[0],$b[1],($r04)=&NR(1));
+	&cmpult($R,$r01,($t01)=&NR(1)); &FR($r01);
+	&add($R,$r02,$R);
+	&add($H1,$t01,$H1); &FR($t01);
+	&muh($a[1],$b[0],($r05)=&NR(1));
+	&cmpult($R,$r02,($t02)=&NR(1)); &FR($r02);
+	&add($R,$r03,$R);
+	&add($H2,$t02,$H2); &FR($t02);
+	&mul($a[0],$b[2],($r06)=&NR(1));
+	&cmpult($R,$r03,($t03)=&NR(1)); &FR($r03);
+	&add($H1,$t03,$H1); &FR($t03);
+ &st($R,&QWPw(1,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r04,$R);
+ &mov("zero",$H2);
+ &mul($a[1],$b[1],($r07)=&NR(1));
+ &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04);
+ &add($R,$r05,$R);
+	&add($H1,$t04,$H1); &FR($t04);
+	&mul($a[2],$b[0],($r08)=&NR(1));
+	&cmpult($R,$r05,($t05)=&NR(1)); &FR($r05);
+	&add($R,$r06,$R);
+	&add($H2,$t05,$H2); &FR($t05);
+	&muh($a[0],$b[2],($r09)=&NR(1));
+	&cmpult($R,$r06,($t06)=&NR(1)); &FR($r06);
+	&add($R,$r07,$R);
+	&add($H1,$t06,$H1); &FR($t06);
+	&muh($a[1],$b[1],($r10)=&NR(1));
+	&cmpult($R,$r07,($t07)=&NR(1)); &FR($r07);
+	&add($R,$r08,$R);
+	&add($H2,$t07,$H2); &FR($t07);
+	&muh($a[2],$b[0],($r11)=&NR(1));
+	&cmpult($R,$r08,($t08)=&NR(1)); &FR($r08);
+	&add($H1,$t08,$H1); &FR($t08);
+ &st($R,&QWPw(2,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r09,$R);
+ &mov("zero",$H2);
+ &mul($a[0],$b[3],($r12)=&NR(1));
+ &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09);
+ &add($R,$r10,$R);
+	&add($H1,$t09,$H1); &FR($t09);
+	&mul($a[1],$b[2],($r13)=&NR(1));
+	&cmpult($R,$r10,($t10)=&NR(1)); &FR($r10);
+	&add($R,$r11,$R);
+	&add($H1,$t10,$H1); &FR($t10);
+	&mul($a[2],$b[1],($r14)=&NR(1));
+	&cmpult($R,$r11,($t11)=&NR(1)); &FR($r11);
+	&add($R,$r12,$R);
+	&add($H1,$t11,$H1); &FR($t11);
+	&mul($a[3],$b[0],($r15)=&NR(1));
+	&cmpult($R,$r12,($t12)=&NR(1)); &FR($r12);
+	&add($R,$r13,$R);
+	&add($H1,$t12,$H1); &FR($t12);
+	&muh($a[0],$b[3],($r16)=&NR(1));
+	&cmpult($R,$r13,($t13)=&NR(1)); &FR($r13);
+	&add($R,$r14,$R);
+	&add($H1,$t13,$H1); &FR($t13);
+	&muh($a[1],$b[2],($r17)=&NR(1));
+	&cmpult($R,$r14,($t14)=&NR(1)); &FR($r14);
+	&add($R,$r15,$R);
+	&add($H1,$t14,$H1); &FR($t14);
+	&muh($a[2],$b[1],($r18)=&NR(1));
+	&cmpult($R,$r15,($t15)=&NR(1)); &FR($r15);
+	&add($H1,$t15,$H1); &FR($t15);
+ &st($R,&QWPw(3,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r16,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[0],($r19)=&NR(1));
+ &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16);
+ &add($R,$r17,$R);
+	&add($H1,$t16,$H1); &FR($t16);
+	&mul($a[1],$b[3],($r20)=&NR(1));
+	&cmpult($R,$r17,($t17)=&NR(1)); &FR($r17);
+	&add($R,$r18,$R);
+	&add($H1,$t17,$H1); &FR($t17);
+	&mul($a[2],$b[2],($r21)=&NR(1));
+	&cmpult($R,$r18,($t18)=&NR(1)); &FR($r18);
+	&add($R,$r19,$R);
+	&add($H1,$t18,$H1); &FR($t18);
+	&mul($a[3],$b[1],($r22)=&NR(1));
+	&cmpult($R,$r19,($t19)=&NR(1)); &FR($r19);
+	&add($R,$r20,$R);
+	&add($H1,$t19,$H1); &FR($t19);
+	&muh($a[1],$b[3],($r23)=&NR(1));
+	&cmpult($R,$r20,($t20)=&NR(1)); &FR($r20);
+	&add($R,$r21,$R);
+	&add($H1,$t20,$H1); &FR($t20);
+	&muh($a[2],$b[2],($r24)=&NR(1));
+	&cmpult($R,$r21,($t21)=&NR(1)); &FR($r21);
+	&add($R,$r22,$R);
+	&add($H1,$t21,$H1); &FR($t21);
+	&muh($a[3],$b[1],($r25)=&NR(1));
+	&cmpult($R,$r22,($t22)=&NR(1)); &FR($r22);
+	&add($H1,$t22,$H1); &FR($t22);
+ &st($R,&QWPw(4,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r23,$R);
+ &mov("zero",$H2);
+ &mul($a[2],$b[3],($r26)=&NR(1));
+ &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23);
+ &add($R,$r24,$R);
+	&add($H1,$t23,$H1); &FR($t23);
+	&mul($a[3],$b[2],($r27)=&NR(1));
+	&cmpult($R,$r24,($t24)=&NR(1)); &FR($r24);
+	&add($R,$r25,$R);
+	&add($H1,$t24,$H1); &FR($t24);
+	&muh($a[2],$b[3],($r28)=&NR(1));
+	&cmpult($R,$r25,($t25)=&NR(1)); &FR($r25);
+	&add($R,$r26,$R);
+	&add($H1,$t25,$H1); &FR($t25);
+	&muh($a[3],$b[2],($r29)=&NR(1));
+	&cmpult($R,$r26,($t26)=&NR(1)); &FR($r26);
+	&add($R,$r27,$R);
+	&add($H1,$t26,$H1); &FR($t26);
+	&mul($a[3],$b[3],($r30)=&NR(1));
+	&cmpult($R,$r27,($t27)=&NR(1)); &FR($r27);
+	&add($H1,$t27,$H1); &FR($t27);
+ &st($R,&QWPw(5,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r28,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[3],($r31)=&NR(1));
+ &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28);
+ &add($R,$r29,$R);
+	&add($H1,$t28,$H1); &FR($t28);
+	############
+	&cmpult($R,$r29,($t29)=&NR(1)); &FR($r29);
+	&add($R,$r30,$R);
+	&add($H1,$t29,$H1); &FR($t29);
+	############
+	&cmpult($R,$r30,($t30)=&NR(1)); &FR($r30);
+	&add($H1,$t30,$H1); &FR($t30);
+ &st($R,&QWPw(6,$rp));
+ &add($H1,$H2,$R);
+
+ &add($R,$r31,$R); &FR($r31);
+ &st($R,&QWPw(7,$rp));
+
+ &FR($R,$H1,$H2);
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul_c4.works.pl b/crypto/bn/asm/alpha.works/mul_c4.works.pl
new file mode 100644
index 0000000000..79d86dd25c
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_c4.works.pl
@@ -0,0 +1,98 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+print STDERR "count=$cnt\n"; $cnt++;
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap);
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]);
+ &st($c0,&QWPw(6,$rp));
+ &st($c1,&QWPw(7,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/mul_c8.pl b/crypto/bn/asm/alpha.works/mul_c8.pl
new file mode 100644
index 0000000000..525ca7494b
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/mul_c8.pl
@@ -0,0 +1,177 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_comba8
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &stack_push(2);
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &st($reg_s0,&swtmp(0)); &FR($reg_s0);
+ &st($reg_s1,&swtmp(1)); &FR($reg_s1);
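+	# comba8 keeps more values live than the scratch registers allow,
+	# so two callee-saved registers are spilled to the stack here and
+	# reloaded just before function_end.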
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+	&ld(($a[4])=&NR(1),&QWPw(4,$ap));
+	&ld(($b[4])=&NR(1),&QWPw(4,$bp));
+	&ld(($a[5])=&NR(1),&QWPw(5,$ap));
+	&ld(($b[5])=&NR(1),&QWPw(5,$bp));
+	&ld(($a[6])=&NR(1),&QWPw(6,$ap));
+	&ld(($b[6])=&NR(1),&QWPw(6,$bp));
+	&ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+	&ld(($b[7])=&NR(1),&QWPw(7,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]);
+ &mul_add_c($a[4],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]);
+ &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]);
+ &mul_add_c($a[5],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]);
+ &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]);
+ &mul_add_c($a[6],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]);
+ &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]);
+ &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]);
+ &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]);
+ &st($c0,&QWPw(14,$rp));
+ &st($c1,&QWPw(15,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &ld($reg_s0,&swtmp(0));
+ &ld($reg_s1,&swtmp(1));
+ &stack_pop(2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/sqr.pl b/crypto/bn/asm/alpha.works/sqr.pl
new file mode 100644
index 0000000000..a55b696906
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/sqr.pl
@@ -0,0 +1,113 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(3);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+	&br(&label("finish"));		# the unrolled loop below is disabled (held in $a)
+	&blt($count,&label("finish"));
+
+ ($a0,$r0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($r0,&QWPw(0,$rp));
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &cmpult($o3,$cc,$cc);
+ &add($cc,$t3,$cc); &FR($t3);
+
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &st($o1,&QWPw(0,$rp)); &FR($o1);
+ &st($o2,&QWPw(0,$rp)); &FR($o2);
+ &st($o3,&QWPw(0,$rp)); &FR($o3);
+
+ &sub($count,4,$count); # count-=4
+ &add($ap,4*$QWS,$ap); # count+=4
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($rp,4*$QWS,$rp); # count+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
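+	# Squaring one word yields two result words (the low and high
+	# halves of a*a), so $rp advances two words per word of $ap.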
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ &mul($a0,$a0,($l0)=&NR(1));
+ &add($ap,$QWS,$ap);
+ &add($rp,2*$QWS,$rp);
+ &sub($count,1,$count);
+ &muh($a0,$a0,($h0)=&NR(1)); &FR($a0);
+ &st($l0,&QWPw(-2,$rp)); &FR($l0);
+ &st($h0,&QWPw(-1,$rp)); &FR($h0);
+
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/sqr_c4.pl b/crypto/bn/asm/alpha.works/sqr_c4.pl
new file mode 100644
index 0000000000..bf33f5b503
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/sqr_c4.pl
@@ -0,0 +1,109 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub sqr_add_c
+ {
+ local($a,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$a,($l1)=&NR(1));
+ &muh($a,$a,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &add($c1,$h1,$c1);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c1,$t1,$c1); &FR($t1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub sqr_add_c2
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &cmplt($l1,"zero",($lc1)=&NR(1));
+ &cmplt($h1,"zero",($hc1)=&NR(1));
+ &add($l1,$l1,$l1);
+ &add($h1,$h1,$h1);
+ &add($h1,$lc1,$h1); &FR($lc1);
+ &add($c2,$hc1,$c2); &FR($hc1);
+
+ &add($c0,$l1,$c0);
+ &add($c1,$h1,$c1);
+ &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1);
+ &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1);
+
+ &add($c1,$lc1,$c1); &FR($lc1);
+ &add($c2,$hc1,$c2); &FR($hc1);
+ }
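+# sqr_add_c2 above accumulates 2*a*b, since each cross product of a
+# square appears twice.  Doubling would lose the top bit of each
+# product half, so that bit is captured first with a signed compare
+# against zero (cmplt is true iff bit 63 is set) and re-added one
+# word higher.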
+
+
+sub bn_sqr_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(2);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap);
+
+ ($c0,$c1,$c2)=&NR(3);
+
+ &mov("zero",$c2);
+ &mul($a[0],$a[0],$c0);
+ &muh($a[0],$a[0],$c1);
+ &st($c0,&QWPw(0,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[3],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp));
+ &st($c1,&QWPw(7,$rp));
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/sqr_c8.pl b/crypto/bn/asm/alpha.works/sqr_c8.pl
new file mode 100644
index 0000000000..b4afe085f1
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/sqr_c8.pl
@@ -0,0 +1,132 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_sqr_comba8
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(2);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &ld(($a[4])=&NR(1),&QWPw(4,$ap));
+ &ld(($a[5])=&NR(1),&QWPw(5,$ap));
+ &ld(($a[6])=&NR(1),&QWPw(6,$ap));
+ &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+
+ ($c0,$c1,$c2)=&NR(3);
+
+ &mov("zero",$c2);
+ &mul($a[0],$a[0],$c0);
+ &muh($a[0],$a[0],$c1);
+ &st($c0,&QWPw(0,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(7,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(8,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2);
+ &st($c0,&QWPw(9,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[5],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2);
+ &st($c0,&QWPw(10,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2);
+ &st($c0,&QWPw(11,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[6],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2);
+ &st($c0,&QWPw(12,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2);
+ &st($c0,&QWPw(13,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[7],$c0,$c1,$c2);
+ &st($c0,&QWPw(14,$rp));
+ &st($c1,&QWPw(15,$rp));
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha.works/sub.pl b/crypto/bn/asm/alpha.works/sub.pl
new file mode 100644
index 0000000000..d998da5c21
--- /dev/null
+++ b/crypto/bn/asm/alpha.works/sub.pl
@@ -0,0 +1,108 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &blt($count,&label("finish"));
+
+ ($a0,$b0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+
+##########################################################
+ &set_label("loop");
+
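+	# Subtraction mirrors the addition idiom: the borrow out of a - b
+	# is the unsigned test a < b taken before the subtract, and
+	# subtracting the incoming borrow can underflow once more; the
+	# two borrow bits are summed into $cc for the next word.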
+ ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8);
+ &ld($a1,&QWPw(1,$ap));
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &ld($b1,&QWPw(1,$bp));
+ &sub($a0,$b0,$a0); # do the subtract
+ &ld($a2,&QWPw(2,$ap));
+ &cmpult($a0,$cc,$b0); # will we borrow?
+ &ld($b2,&QWPw(2,$bp));
+	&sub($a0,$cc,$o0);		# subtract the borrow
+ &ld($a3,&QWPw(3,$ap));
+ &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp);
+
+ &cmpult($a1,$b1,$t1); # will we borrow?
+ &sub($a1,$b1,$a1); # do the subtract
+ &ld($b3,&QWPw(3,$bp));
+ &cmpult($a1,$cc,$b1); # will we borrow?
+	&sub($a1,$cc,$o1);		# subtract the borrow
+ &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1);
+
+ &cmpult($a2,$b2,$tmp); # will we borrow?
+ &sub($a2,$b2,$a2); # do the subtract
+ &st($o0,&QWPw(0,$rp)); &FR($o0); # save
+ &cmpult($a2,$cc,$b2); # will we borrow?
+	&sub($a2,$cc,$o2);		# subtract the borrow
+ &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2);
+
+ &cmpult($a3,$b3,$t3); # will we borrow?
+ &sub($a3,$b3,$a3); # do the subtract
+ &st($o1,&QWPw(1,$rp)); &FR($o1);
+ &cmpult($a3,$cc,$b3); # will we borrow?
+	&sub($a3,$cc,$o3);		# subtract the borrow
+ &add($b3,$t3,$cc); &FR($t3,$a3,$b3);
+
+ &st($o2,&QWPw(2,$rp)); &FR($o2);
+ &sub($count,4,$count); # count-=4
+ &st($o3,&QWPw(3,$rp)); &FR($o3);
+	&add($ap,4*$QWS,$ap);			# ap+=4 words
+	&add($bp,4*$QWS,$bp);			# bp+=4 words
+	&add($rp,4*$QWS,$rp);			# rp+=4 words
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &sub($a0,$b0,$a0); # do the subtract
+ &cmpult($a0,$cc,$b0); # will we borrow?
+	&sub($a0,$cc,$a0);		# subtract the borrow
+ &st($a0,&QWPw(0,$rp)); # save
+ &add($b0,$tmp,$cc); # add the borrows
+
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/add.pl b/crypto/bn/asm/alpha/add.pl
new file mode 100644
index 0000000000..13bf516428
--- /dev/null
+++ b/crypto/bn/asm/alpha/add.pl
@@ -0,0 +1,118 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_add_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &blt($count,&label("finish"));
+
+ ($a0,$b0)=&NR(2);
+
+##########################################################
+ &set_label("loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap));
+ &ld(($b0)=&NR(1),&QWPw(0,$bp));
+ &ld(($a1)=&NR(1),&QWPw(1,$ap));
+ &ld(($b1)=&NR(1),&QWPw(1,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &ld(($a2)=&NR(1),&QWPw(2,$ap));
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &ld(($b2)=&NR(1),&QWPw(2,$bp));
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &ld(($a3)=&NR(1),&QWPw(3,$ap));
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &ld(($b3)=&NR(1),&QWPw(3,$bp));
+ &st($o0,&QWPw(0,$rp)); &FR($o0);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+	&st($o1,&QWPw(1,$rp)); &FR($o1);
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+	&st($o2,&QWPw(2,$rp)); &FR($o2);
+ &cmpult($o3,$cc,$cc);
+	&st($o3,&QWPw(3,$rp)); &FR($o3);
+ &add($cc,$t3,$cc); &FR($t3);
+
+
+ &sub($count,4,$count); # count-=4
+	&add($ap,4*$QWS,$ap);			# ap+=4 words
+	&add($bp,4*$QWS,$bp);			# bp+=4 words
+	&add($rp,4*$QWS,$rp);			# rp+=4 words
+
+ ###
+ &bge($count,&label("loop"));
+ ###
+ &br(&label("finish"));
+##################################################
+ # Do the last 0..3 words
+
+ ($t0,$o0)=&NR(2);
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($a0,$b0,$o0);
+ &sub($count,1,$count);
+	&cmpult($o0,$b0,$t0);			# did a+b carry?
+	&add($o0,$cc,$o0);			# add the carry in
+	&cmpult($o0,$cc,$cc);			# did adding cc carry?
+	&add($rp,$QWS,$rp);
+	&st($o0,&QWPw(-1,$rp));			# save
+	&add($cc,$t0,$cc);			# add the carries
+
+ ###
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($o0,$t0,$a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/div.pl b/crypto/bn/asm/alpha/div.pl
new file mode 100644
index 0000000000..e9e680897a
--- /dev/null
+++ b/crypto/bn/asm/alpha/div.pl
@@ -0,0 +1,144 @@
+#!/usr/local/bin/perl
+
+sub bn_div_words
+ {
+ local($data)=<<'EOF';
+ #
+ # What follows was taken directly from the C compiler with a few
+	# hacks to redo the labels.
+ #
+.text
+ .set noreorder
+ .set volatile
+ .align 3
+ .globl bn_div_words
+ .ent bn_div_words
+bn_div_words:
+ ldgp $29,0($27)
+bn_div_words..ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$9119
+ lda $0,-1
+ br $31,$9136
+ .align 4
+$9119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$9120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$9120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$9120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$9122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$9122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$9123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$9126
+ zapnot $7,15,$27
+ br $31,$9127
+ .align 4
+$9126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$9127:
+ srl $10,32,$4
+ .align 5
+$9128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$9129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$9129
+ subq $27,1,$27
+ br $31,$9128
+ .align 4
+$9129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$9134
+ addq $9,$11,$9
+ subq $27,1,$27
+$9134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$9124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$9123
+ .align 4
+$9124:
+ bis $13,$27,$0
+$9136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div_words
+EOF
+ &asm_add($data);
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul.pl b/crypto/bn/asm/alpha/mul.pl
new file mode 100644
index 0000000000..76c926566c
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul.pl
@@ -0,0 +1,104 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ ###
+ &blt($count,&label("finish"));
+
+ ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap));
+
+ &set_label("loop");
+
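+	# The "wait" notes below track the multiplier: a product cannot be
+	# used until about 8 cycles after issue, so the adds, loads and
+	# stores between the mul/muh pairs are there to hide that latency
+	# (see the notes at the top of alpha.s).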
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+
+	&muh($a0,$word,($h0)=&NR(1));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ### wait 8
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ ### wait 8
+	&muh($a1,$word,($h1)=&NR(1));
+ &add($l0,$cc,$l0); ### wait 8
+ &mul($a1,$word,($l1)=&NR(1)); &FR($a1);
+ &cmpult($l0,$cc,$cc); ### wait 8
+	&muh($a2,$word,($h2)=&NR(1));
+ &add($h0,$cc,$cc); &FR($h0); ### wait 8
+ &mul($a2,$word,($l2)=&NR(1)); &FR($a2);
+ &add($l1,$cc,$l1); ### wait 8
+ &st($l0,&QWPw(0,$rp)); &FR($l0);
+ &cmpult($l1,$cc,$cc); ### wait 8
+	&muh($a3,$word,($h3)=&NR(1));
+ &add($h1,$cc,$cc); &FR($h1);
+ &mul($a3,$word,($l3)=&NR(1)); &FR($a3);
+ &add($l2,$cc,$l2);
+ &st($l1,&QWPw(1,$rp)); &FR($l1);
+ &cmpult($l2,$cc,$cc);
+ &add($h2,$cc,$cc); &FR($h2);
+	&sub($count,4,$count);			# count-=4
+	&st($l2,&QWPw(2,$rp)); &FR($l2);
+	&add($l3,$cc,$l3);
+	&cmpult($l3,$cc,$cc);
+	&add($h3,$cc,$cc); &FR($h3);
+	&add($ap,4*$QWS,$ap);			# ap+=4 words
+	&st($l3,&QWPw(3,$rp)); &FR($l3);
+	&add($rp,4*$QWS,$rp);			# rp+=4 words
+ ###
+ &blt($count,&label("finish"));
+ ($a0)=&NR(1); &ld($a0,&QWPw(0,$ap));
+	&br(&label("loop"));
+##################################################
+
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ ###
+ ###
+ ###
+ &muh($a0,$word,($h0)=&NR(1));
+ ### Wait 8 for next mul issue
+	&mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ &add($ap,$QWS,$ap);
+ ### Loose 12 until result is available
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &add($l0,$cc,$l0);
+ ###
+ &st($l0,&QWPw(-1,$rp)); &FR($l0);
+ &cmpult($l0,$cc,$cc);
+ &add($h0,$cc,$cc); &FR($h0);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_add.pl b/crypto/bn/asm/alpha/mul_add.pl
new file mode 100644
index 0000000000..0d6df69bc4
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_add.pl
@@ -0,0 +1,123 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+ $word=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ ###
+	&blt($count,&label("finish"));
+
+	&ld(($a0)=&NR(1),&QWPw(0,$ap));
+	&br(&label("finish"));		# the unrolled loop below is disabled (held in $a)
+
+$a=<<'EOF';
+##########################################################
+ &set_label("loop");
+
+ &ld(($r0)=&NR(1),&QWPw(0,$rp));
+ &ld(($a1)=&NR(1),&QWPw(1,$ap));
+ &muh($a0,$word,($h0)=&NR(1));
+ &ld(($r1)=&NR(1),&QWPw(1,$rp));
+ &ld(($a2)=&NR(1),&QWPw(2,$ap));
+ ###
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ &ld(($r2)=&NR(1),&QWPw(2,$rp));
+ &muh($a1,$word,($h1)=&NR(1));
+ &ld(($a3)=&NR(1),&QWPw(3,$ap));
+ &mul($a1,$word,($l1)=&NR(1)); &FR($a1);
+ &ld(($r3)=&NR(1),&QWPw(3,$rp));
+ &add($r0,$l0,$r0);
+ &add($r1,$l1,$r1);
+ &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0);
+ &cmpult($r1,$l1,($t1)=&NR(1)); &FR($l1);
+ &muh($a2,$word,($h2)=&NR(1));
+ &add($r0,$cc,$r0);
+ &add($h0,$t0,$h0); &FR($t0);
+ &cmpult($r0,$cc,$cc);
+ &add($h1,$t1,$h1); &FR($t1);
+ &add($h0,$cc,$cc); &FR($h0);
+ &mul($a2,$word,($l2)=&NR(1)); &FR($a2);
+ &add($r1,$cc,$r1);
+ &cmpult($r1,$cc,$cc);
+ &add($r2,$l2,$r2);
+ &add($h1,$cc,$cc); &FR($h1);
+ &cmpult($r2,$l2,($t2)=&NR(1)); &FR($l2);
+ &muh($a3,$word,($h3)=&NR(1));
+ &add($r2,$cc,$r2);
+ &st($r0,&QWPw(0,$rp)); &FR($r0);
+ &add($h2,$t2,$h2); &FR($t2);
+ &st($r1,&QWPw(1,$rp)); &FR($r1);
+ &cmpult($r2,$cc,$cc);
+ &mul($a3,$word,($l3)=&NR(1)); &FR($a3);
+ &add($h2,$cc,$cc); &FR($h2);
+ &st($r2,&QWPw(2,$rp)); &FR($r2);
+ &sub($count,4,$count); # count-=4
+ &add($rp,4*$QWS,$rp); # count+=4
+ &add($r3,$l3,$r3);
+ &add($ap,4*$QWS,$ap); # count+=4
+ &cmpult($r3,$l3,($t3)=&NR(1)); &FR($l3);
+ &add($r3,$cc,$r3);
+ &add($h3,$t3,$h3); &FR($t3);
+ &cmpult($r3,$cc,$cc);
+ &st($r3,&QWPw(-1,$rp)); &FR($r3);
+ &add($h3,$cc,$cc); &FR($h3);
+
+ ###
+ &blt($count,&label("finish"));
+ &ld(($a0)=&NR(1),&QWPw(0,$ap));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+	&ld(($r0)=&NR(1),&QWPw(0,$rp));	# get r
+ ###
+ ###
+	&muh($a0,$word,($h0)=&NR(1));
+ ### wait 8
+ &mul($a0,$word,($l0)=&NR(1)); &FR($a0);
+ &add($rp,$QWS,$rp);
+ &add($ap,$QWS,$ap);
+ &sub($count,1,$count);
+ ### wait 3 until l0 is available
+ &add($r0,$l0,$r0);
+ ###
+ &cmpult($r0,$l0,($t0)=&NR(1)); &FR($l0);
+ &add($r0,$cc,$r0);
+ &add($h0,$t0,$h0); &FR($t0);
+ &cmpult($r0,$cc,$cc);
+ &add($h0,$cc,$cc); &FR($h0);
+
+ &st($r0,&QWPw(-1,$rp)); &FR($r0);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c4.pl b/crypto/bn/asm/alpha/mul_c4.pl
new file mode 100644
index 0000000000..9cc876ded4
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c4.pl
@@ -0,0 +1,215 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &mul($a[0],$b[0],($r00)=&NR(1));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &muh($a[0],$b[0],($r01)=&NR(1));
+ &FR($ap); &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &FR($bp); &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+ &mul($a[0],$b[1],($r02)=&NR(1));
+
+ ($R,$H1,$H2)=&NR(3);
+
+ &st($r00,&QWPw(0,$rp)); &FR($r00);
+
+ &mov("zero",$R);
+ &mul($a[1],$b[0],($r03)=&NR(1));
+
+ &mov("zero",$H1);
+	&mov("zero",$H2);
+	&add($R,$r01,$R);
+	&muh($a[0],$b[1],($r04)=&NR(1));
+	&cmpult($R,$r01,($t01)=&NR(1)); &FR($r01);
+	&add($R,$r02,$R);
+	&add($H1,$t01,$H1); &FR($t01);
+	&muh($a[1],$b[0],($r05)=&NR(1));
+	&cmpult($R,$r02,($t02)=&NR(1)); &FR($r02);
+	&add($R,$r03,$R);
+	&add($H2,$t02,$H2); &FR($t02);
+	&mul($a[0],$b[2],($r06)=&NR(1));
+	&cmpult($R,$r03,($t03)=&NR(1)); &FR($r03);
+	&add($H1,$t03,$H1); &FR($t03);
+ &st($R,&QWPw(1,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r04,$R);
+ &mov("zero",$H2);
+ &mul($a[1],$b[1],($r07)=&NR(1));
+ &cmpult($R,$r04,($t04)=&NR(1)); &FR($r04);
+ &add($R,$r05,$R);
+	&add($H1,$t04,$H1); &FR($t04);
+	&mul($a[2],$b[0],($r08)=&NR(1));
+	&cmpult($R,$r05,($t05)=&NR(1)); &FR($r05);
+	&add($R,$r06,$R);
+	&add($H2,$t05,$H2); &FR($t05);
+	&muh($a[0],$b[2],($r09)=&NR(1));
+	&cmpult($R,$r06,($t06)=&NR(1)); &FR($r06);
+	&add($R,$r07,$R);
+	&add($H1,$t06,$H1); &FR($t06);
+	&muh($a[1],$b[1],($r10)=&NR(1));
+	&cmpult($R,$r07,($t07)=&NR(1)); &FR($r07);
+	&add($R,$r08,$R);
+	&add($H2,$t07,$H2); &FR($t07);
+	&muh($a[2],$b[0],($r11)=&NR(1));
+	&cmpult($R,$r08,($t08)=&NR(1)); &FR($r08);
+	&add($H1,$t08,$H1); &FR($t08);
+ &st($R,&QWPw(2,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r09,$R);
+ &mov("zero",$H2);
+ &mul($a[0],$b[3],($r12)=&NR(1));
+ &cmpult($R,$r09,($t09)=&NR(1)); &FR($r09);
+ &add($R,$r10,$R);
+	&add($H1,$t09,$H1); &FR($t09);
+	&mul($a[1],$b[2],($r13)=&NR(1));
+	&cmpult($R,$r10,($t10)=&NR(1)); &FR($r10);
+	&add($R,$r11,$R);
+	&add($H1,$t10,$H1); &FR($t10);
+	&mul($a[2],$b[1],($r14)=&NR(1));
+	&cmpult($R,$r11,($t11)=&NR(1)); &FR($r11);
+	&add($R,$r12,$R);
+	&add($H1,$t11,$H1); &FR($t11);
+	&mul($a[3],$b[0],($r15)=&NR(1));
+	&cmpult($R,$r12,($t12)=&NR(1)); &FR($r12);
+	&add($R,$r13,$R);
+	&add($H1,$t12,$H1); &FR($t12);
+	&muh($a[0],$b[3],($r16)=&NR(1));
+	&cmpult($R,$r13,($t13)=&NR(1)); &FR($r13);
+	&add($R,$r14,$R);
+	&add($H1,$t13,$H1); &FR($t13);
+	&muh($a[1],$b[2],($r17)=&NR(1));
+	&cmpult($R,$r14,($t14)=&NR(1)); &FR($r14);
+	&add($R,$r15,$R);
+	&add($H1,$t14,$H1); &FR($t14);
+	&muh($a[2],$b[1],($r18)=&NR(1));
+	&cmpult($R,$r15,($t15)=&NR(1)); &FR($r15);
+	&add($H1,$t15,$H1); &FR($t15);
+ &st($R,&QWPw(3,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r16,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[0],($r19)=&NR(1));
+ &cmpult($R,$r16,($t16)=&NR(1)); &FR($r16);
+ &add($R,$r17,$R);
+	&add($H1,$t16,$H1); &FR($t16);
+	&mul($a[1],$b[3],($r20)=&NR(1));
+	&cmpult($R,$r17,($t17)=&NR(1)); &FR($r17);
+	&add($R,$r18,$R);
+	&add($H1,$t17,$H1); &FR($t17);
+	&mul($a[2],$b[2],($r21)=&NR(1));
+	&cmpult($R,$r18,($t18)=&NR(1)); &FR($r18);
+	&add($R,$r19,$R);
+	&add($H1,$t18,$H1); &FR($t18);
+	&mul($a[3],$b[1],($r22)=&NR(1));
+	&cmpult($R,$r19,($t19)=&NR(1)); &FR($r19);
+	&add($R,$r20,$R);
+	&add($H1,$t19,$H1); &FR($t19);
+	&muh($a[1],$b[3],($r23)=&NR(1));
+	&cmpult($R,$r20,($t20)=&NR(1)); &FR($r20);
+	&add($R,$r21,$R);
+	&add($H1,$t20,$H1); &FR($t20);
+	&muh($a[2],$b[2],($r24)=&NR(1));
+	&cmpult($R,$r21,($t21)=&NR(1)); &FR($r21);
+	&add($R,$r22,$R);
+	&add($H1,$t21,$H1); &FR($t21);
+	&muh($a[3],$b[1],($r25)=&NR(1));
+	&cmpult($R,$r22,($t22)=&NR(1)); &FR($r22);
+	&add($H1,$t22,$H1); &FR($t22);
+ &st($R,&QWPw(4,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r23,$R);
+ &mov("zero",$H2);
+ &mul($a[2],$b[3],($r26)=&NR(1));
+ &cmpult($R,$r23,($t23)=&NR(1)); &FR($r23);
+ &add($R,$r24,$R);
+	&add($H1,$t23,$H1); &FR($t23);
+	&mul($a[3],$b[2],($r27)=&NR(1));
+	&cmpult($R,$r24,($t24)=&NR(1)); &FR($r24);
+	&add($R,$r25,$R);
+	&add($H1,$t24,$H1); &FR($t24);
+	&muh($a[2],$b[3],($r28)=&NR(1));
+	&cmpult($R,$r25,($t25)=&NR(1)); &FR($r25);
+	&add($R,$r26,$R);
+	&add($H1,$t25,$H1); &FR($t25);
+	&muh($a[3],$b[2],($r29)=&NR(1));
+	&cmpult($R,$r26,($t26)=&NR(1)); &FR($r26);
+	&add($R,$r27,$R);
+	&add($H1,$t26,$H1); &FR($t26);
+	&mul($a[3],$b[3],($r30)=&NR(1));
+	&cmpult($R,$r27,($t27)=&NR(1)); &FR($r27);
+	&add($H1,$t27,$H1); &FR($t27);
+ &st($R,&QWPw(5,$rp));
+ &add($H1,$H2,$R);
+
+ &mov("zero",$H1);
+ &add($R,$r28,$R);
+ &mov("zero",$H2);
+ &muh($a[3],$b[3],($r31)=&NR(1));
+ &cmpult($R,$r28,($t28)=&NR(1)); &FR($r28);
+ &add($R,$r29,$R);
+	&add($H1,$t28,$H1); &FR($t28);
+	############
+	&cmpult($R,$r29,($t29)=&NR(1)); &FR($r29);
+	&add($R,$r30,$R);
+	&add($H1,$t29,$H1); &FR($t29);
+	############
+	&cmpult($R,$r30,($t30)=&NR(1)); &FR($r30);
+	&add($H1,$t30,$H1); &FR($t30);
+ &st($R,&QWPw(6,$rp));
+ &add($H1,$H2,$R);
+
+ &add($R,$r31,$R); &FR($r31);
+ &st($R,&QWPw(7,$rp));
+
+ &FR($R,$H1,$H2);
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c4.works.pl b/crypto/bn/asm/alpha/mul_c4.works.pl
new file mode 100644
index 0000000000..79d86dd25c
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c4.works.pl
@@ -0,0 +1,98 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub mul_add_c
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+print STDERR "count=$cnt\n"; $cnt++;
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &add($t1,$h1,$h1); &FR($t1);
+ &add($c1,$h1,$c1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub bn_mul_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap);
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR($c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2); &FR($a[3],$b[3]);
+ &st($c0,&QWPw(6,$rp));
+ &st($c1,&QWPw(7,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/mul_c8.pl b/crypto/bn/asm/alpha/mul_c8.pl
new file mode 100644
index 0000000000..525ca7494b
--- /dev/null
+++ b/crypto/bn/asm/alpha/mul_c8.pl
@@ -0,0 +1,177 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_mul_comba8
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(3);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &stack_push(2);
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($b[0])=&NR(1),&QWPw(0,$bp));
+ &st($reg_s0,&swtmp(0)); &FR($reg_s0);
+ &st($reg_s1,&swtmp(1)); &FR($reg_s1);
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($b[1])=&NR(1),&QWPw(1,$bp));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($b[2])=&NR(1),&QWPw(2,$bp));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &ld(($b[3])=&NR(1),&QWPw(3,$bp));
+	&ld(($a[4])=&NR(1),&QWPw(4,$ap));
+	&ld(($b[4])=&NR(1),&QWPw(4,$bp));
+	&ld(($a[5])=&NR(1),&QWPw(5,$ap));
+	&ld(($b[5])=&NR(1),&QWPw(5,$bp));
+	&ld(($a[6])=&NR(1),&QWPw(6,$ap));
+	&ld(($b[6])=&NR(1),&QWPw(6,$bp));
+	&ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+	&ld(($b[7])=&NR(1),&QWPw(7,$bp)); &FR($bp);
+
+ ($c0,$c1,$c2)=&NR(3);
+ &mov("zero",$c2);
+ &mul($a[0],$b[0],$c0);
+ &muh($a[0],$b[0],$c1);
+ &st($c0,&QWPw(0,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[1],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[0],$b[7],$c0,$c1,$c2); &FR($a[0]);
+ &mul_add_c($a[1],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[2],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[1],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[0],$c0,$c1,$c2); &FR($b[0]);
+ &st($c0,&QWPw(7,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[1],$b[7],$c0,$c1,$c2); &FR($a[1]);
+ &mul_add_c($a[2],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[3],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[2],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[1],$c0,$c1,$c2); &FR($b[1]);
+ &st($c0,&QWPw(8,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[2],$b[7],$c0,$c1,$c2); &FR($a[2]);
+ &mul_add_c($a[3],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[4],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[3],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[2],$c0,$c1,$c2); &FR($b[2]);
+ &st($c0,&QWPw(9,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[3],$b[7],$c0,$c1,$c2); &FR($a[3]);
+ &mul_add_c($a[4],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[5],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[4],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[3],$c0,$c1,$c2); &FR($b[3]);
+ &st($c0,&QWPw(10,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[4],$b[7],$c0,$c1,$c2); &FR($a[4]);
+ &mul_add_c($a[5],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[6],$b[5],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[4],$c0,$c1,$c2); &FR($b[4]);
+ &st($c0,&QWPw(11,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[5],$b[7],$c0,$c1,$c2); &FR($a[5]);
+ &mul_add_c($a[6],$b[6],$c0,$c1,$c2);
+ &mul_add_c($a[7],$b[5],$c0,$c1,$c2); &FR($b[5]);
+ &st($c0,&QWPw(12,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[6],$b[7],$c0,$c1,$c2); &FR($a[6]);
+ &mul_add_c($a[7],$b[6],$c0,$c1,$c2); &FR($b[6]);
+ &st($c0,&QWPw(13,$rp)); &FR($c0); ($c0)=&NR(1);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &mul_add_c($a[7],$b[7],$c0,$c1,$c2); &FR($a[7],$b[7]);
+ &st($c0,&QWPw(14,$rp));
+ &st($c1,&QWPw(15,$rp));
+
+ &FR($c0,$c1,$c2);
+
+ &ld($reg_s0,&swtmp(0));
+ &ld($reg_s1,&swtmp(1));
+ &stack_pop(2);
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
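
The comba routine above evaluates one column of the 8x8 product at a time: word k of the result is the sum of all a[i]*b[k-i], accumulated into the rotating register triple (c0,c1,c2); after each column is stored, the triple rotates so the old c1 seeds the next column and the old c0 is recycled as the fresh spill word. A minimal C sketch of the per-product step, assuming a 64-bit BN_ULONG and gcc's unsigned __int128 (both assumptions; the shipped code is the generated Alpha assembly):

    /* c2:c1:c0 += a*b -- a sketch of what each &mul_add_c() call emits */
    typedef unsigned long BN_ULONG;          /* assumed 64-bit */
    typedef unsigned __int128 BN_ULLONG;     /* gcc extension, assumed */

    static void mul_add_c(BN_ULONG a, BN_ULONG b,
                          BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
        {
        BN_ULLONG t = (BN_ULLONG)a * b;      /* mul + muh */
        BN_ULONG lo = (BN_ULONG)t;
        BN_ULONG hi = (BN_ULONG)(t >> 64);

        *c0 += lo;                           /* add the low word */
        hi += (*c0 < lo);                    /* cmpult: carry out of c0 */
        *c1 += hi;                           /* add the high word */
        *c2 += (*c1 < hi);                   /* cmpult: carry out of c1 */
        }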
diff --git a/crypto/bn/asm/alpha/sqr.pl b/crypto/bn/asm/alpha/sqr.pl
new file mode 100644
index 0000000000..a55b696906
--- /dev/null
+++ b/crypto/bn/asm/alpha/sqr.pl
@@ -0,0 +1,113 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+	local($cc,$a,$b,$r,$count);
+
+ &init_pool(3);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $count=&wparam(2);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &br(&label("finish"));
+ &blt($count,&label("finish"));
+
+ ($a0,$r0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($r0,&QWPw(0,$rp));
+
+$a=<<'EOF'; # dead code: the unrolled loop (copied from add.pl, still using $bp) is parked in an unused string
+##########################################################
+ &set_label("loop");
+
+ ($a1)=&NR(1); &ld($a1,&QWPw(1,$ap));
+ ($b1)=&NR(1); &ld($b1,&QWPw(1,$bp));
+ ($a2)=&NR(1); &ld($a2,&QWPw(2,$ap));
+ ($b2)=&NR(1); &ld($b2,&QWPw(2,$bp));
+ ($a3)=&NR(1); &ld($a3,&QWPw(3,$ap));
+ ($b3)=&NR(1); &ld($b3,&QWPw(3,$bp));
+
+ ($o0,$t0)=&NR(2);
+ &add($a0,$b0,$o0);
+ &cmpult($o0,$b0,$t0);
+ &add($o0,$cc,$o0);
+ &cmpult($o0,$cc,$cc);
+ &add($cc,$t0,$cc); &FR($t0);
+
+ ($t1,$o1)=&NR(2);
+
+ &add($a1,$b1,$o1); &FR($a1);
+ &cmpult($o1,$b1,$t1); &FR($b1);
+ &add($o1,$cc,$o1);
+ &cmpult($o1,$cc,$cc);
+ &add($cc,$t1,$cc); &FR($t1);
+
+ ($t2,$o2)=&NR(2);
+
+ &add($a2,$b2,$o2); &FR($a2);
+ &cmpult($o2,$b2,$t2); &FR($b2);
+ &add($o2,$cc,$o2);
+ &cmpult($o2,$cc,$cc);
+ &add($cc,$t2,$cc); &FR($t2);
+
+ ($t3,$o3)=&NR(2);
+
+ &add($a3,$b3,$o3); &FR($a3);
+ &cmpult($o3,$b3,$t3); &FR($b3);
+ &add($o3,$cc,$o3);
+ &cmpult($o3,$cc,$cc);
+ &add($cc,$t3,$cc); &FR($t3);
+
+	&st($o0,&QWPw(0,$rp)); &FR($o0);
+	&st($o1,&QWPw(1,$rp)); &FR($o1);
+	&st($o2,&QWPw(2,$rp)); &FR($o2);
+	&st($o3,&QWPw(3,$rp)); &FR($o3);
+
+ &sub($count,4,$count); # count-=4
+	&add($ap,4*$QWS,$ap);	# ap+=4
+	&add($bp,4*$QWS,$bp);	# bp+=4
+	&add($rp,4*$QWS,$rp);	# rp+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+EOF
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld(($a0)=&NR(1),&QWPw(0,$ap)); # get a
+ &mul($a0,$a0,($l0)=&NR(1));
+ &add($ap,$QWS,$ap);
+ &add($rp,2*$QWS,$rp);
+ &sub($count,1,$count);
+ &muh($a0,$a0,($h0)=&NR(1)); &FR($a0);
+ &st($l0,&QWPw(-2,$rp)); &FR($l0);
+ &st($h0,&QWPw(-1,$rp)); &FR($h0);
+
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
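
bn_sqr_words itself is simpler than the comba code: each input word is squared independently into two output words, so no carry flows between iterations. A reference sketch in C, under the same BN_ULONG/BN_ULLONG assumptions as the earlier sketch:

    /* r[2i] and r[2i+1] receive the low and high halves of a[i]^2 */
    static void bn_sqr_words_ref(BN_ULONG *r, const BN_ULONG *a, int n)
        {
        while (n-- > 0)
            {
            BN_ULLONG t = (BN_ULLONG)a[0] * a[0];   /* mul + muh */
            r[0] = (BN_ULONG)t;
            r[1] = (BN_ULONG)(t >> 64);
            a++;
            r += 2;
            }
        }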
diff --git a/crypto/bn/asm/alpha/sqr_c4.pl b/crypto/bn/asm/alpha/sqr_c4.pl
new file mode 100644
index 0000000000..bf33f5b503
--- /dev/null
+++ b/crypto/bn/asm/alpha/sqr_c4.pl
@@ -0,0 +1,109 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub sqr_add_c
+ {
+ local($a,$c0,$c1,$c2)=@_;
+ local($l1,$h1,$t1,$t2);
+
+ &mul($a,$a,($l1)=&NR(1));
+ &muh($a,$a,($h1)=&NR(1));
+ &add($c0,$l1,$c0);
+ &add($c1,$h1,$c1);
+ &cmpult($c0,$l1,($t1)=&NR(1)); &FR($l1);
+ &cmpult($c1,$h1,($t2)=&NR(1)); &FR($h1);
+ &add($c1,$t1,$c1); &FR($t1);
+ &add($c2,$t2,$c2); &FR($t2);
+ }
+
+sub sqr_add_c2
+ {
+ local($a,$b,$c0,$c1,$c2)=@_;
+	local($l1,$h1,$lc1,$hc1);
+
+ &mul($a,$b,($l1)=&NR(1));
+ &muh($a,$b,($h1)=&NR(1));
+ &cmplt($l1,"zero",($lc1)=&NR(1));
+ &cmplt($h1,"zero",($hc1)=&NR(1));
+ &add($l1,$l1,$l1);
+ &add($h1,$h1,$h1);
+ &add($h1,$lc1,$h1); &FR($lc1);
+ &add($c2,$hc1,$c2); &FR($hc1);
+
+ &add($c0,$l1,$c0);
+ &add($c1,$h1,$c1);
+ &cmpult($c0,$l1,($lc1)=&NR(1)); &FR($l1);
+ &cmpult($c1,$h1,($hc1)=&NR(1)); &FR($h1);
+
+ &add($c1,$lc1,$c1); &FR($lc1);
+ &add($c2,$hc1,$c2); &FR($hc1);
+ }
+
+
+sub bn_sqr_comba4
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(2);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap)); &FR($ap);
+
+ ($c0,$c1,$c2)=&NR(3);
+
+ &mov("zero",$c2);
+ &mul($a[0],$a[0],$c0);
+ &muh($a[0],$a[0],$c1);
+ &st($c0,&QWPw(0,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[0],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[3],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp));
+ &st($c1,&QWPw(7,$rp));
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
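
sqr_add_c2 above handles the off-diagonal terms of a square, which occur twice, so the product must be doubled before it is accumulated; the cmplt against "zero" is a signed compare that captures the top bit each doubling shifts out. The same logic in C, with the same hypothetical types as the earlier sketches:

    /* c2:c1:c0 += 2*a*b -- a sketch of what &sqr_add_c2() emits */
    static void sqr_add_c2(BN_ULONG a, BN_ULONG b,
                           BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
        {
        BN_ULLONG t = (BN_ULLONG)a * b;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
        BN_ULONG lt = lo >> 63, ht = hi >> 63;  /* bits lost by doubling */

        lo <<= 1;
        hi = (hi << 1) + lt;     /* low bit is free, so this cannot overflow */
        *c2 += ht;               /* doubling carry goes straight into c2 */

        *c0 += lo;
        hi += (*c0 < lo);        /* cannot overflow either (hi <= 2^64-2) */
        *c1 += hi;
        *c2 += (*c1 < hi);
        }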
diff --git a/crypto/bn/asm/alpha/sqr_c8.pl b/crypto/bn/asm/alpha/sqr_c8.pl
new file mode 100644
index 0000000000..b4afe085f1
--- /dev/null
+++ b/crypto/bn/asm/alpha/sqr_c8.pl
@@ -0,0 +1,132 @@
+#!/usr/local/bin/perl
+# alpha assember
+
+sub bn_sqr_comba8
+ {
+ local($name)=@_;
+ local(@a,@b,$r,$c0,$c1,$c2);
+
+ $cnt=1;
+ &init_pool(2);
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+
+ &function_begin($name,"");
+
+ &comment("");
+
+ &ld(($a[0])=&NR(1),&QWPw(0,$ap));
+ &ld(($a[1])=&NR(1),&QWPw(1,$ap));
+ &ld(($a[2])=&NR(1),&QWPw(2,$ap));
+ &ld(($a[3])=&NR(1),&QWPw(3,$ap));
+ &ld(($a[4])=&NR(1),&QWPw(4,$ap));
+ &ld(($a[5])=&NR(1),&QWPw(5,$ap));
+ &ld(($a[6])=&NR(1),&QWPw(6,$ap));
+ &ld(($a[7])=&NR(1),&QWPw(7,$ap)); &FR($ap);
+
+ ($c0,$c1,$c2)=&NR(3);
+
+ &mov("zero",$c2);
+ &mul($a[0],$a[0],$c0);
+ &muh($a[0],$a[0],$c1);
+ &st($c0,&QWPw(0,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[1],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(1,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[2],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(2,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[2],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(3,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[3],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(4,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[3],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(5,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[4],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(6,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[4],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[1],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[0],$c0,$c1,$c2);
+ &st($c0,&QWPw(7,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[5],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[2],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[1],$c0,$c1,$c2);
+ &st($c0,&QWPw(8,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[5],$a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[3],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[2],$c0,$c1,$c2);
+ &st($c0,&QWPw(9,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[5],$c0,$c1,$c2);
+ &sqr_add_c2($a[6],$a[4],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[3],$c0,$c1,$c2);
+ &st($c0,&QWPw(10,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[6],$a[5],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[4],$c0,$c1,$c2);
+ &st($c0,&QWPw(11,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[6],$c0,$c1,$c2);
+ &sqr_add_c2($a[7],$a[5],$c0,$c1,$c2);
+ &st($c0,&QWPw(12,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c2($a[7],$a[6],$c0,$c1,$c2);
+ &st($c0,&QWPw(13,$rp));
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ &mov("zero",$c2);
+
+ &sqr_add_c($a[7],$c0,$c1,$c2);
+ &st($c0,&QWPw(14,$rp));
+ &st($c1,&QWPw(15,$rp));
+
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
diff --git a/crypto/bn/asm/alpha/sub.pl b/crypto/bn/asm/alpha/sub.pl
new file mode 100644
index 0000000000..d998da5c21
--- /dev/null
+++ b/crypto/bn/asm/alpha/sub.pl
@@ -0,0 +1,108 @@
+#!/usr/local/bin/perl
+# alpha assembler
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ &init_pool(4);
+ ($cc)=GR("r0");
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &blt($count,&label("finish"));
+
+ ($a0,$b0)=&NR(2);
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+
+##########################################################
+ &set_label("loop");
+
+ ($a1,$tmp,$b1,$a2,$b2,$a3,$b3,$o0)=&NR(8);
+ &ld($a1,&QWPw(1,$ap));
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &ld($b1,&QWPw(1,$bp));
+ &sub($a0,$b0,$a0); # do the subtract
+ &ld($a2,&QWPw(2,$ap));
+ &cmpult($a0,$cc,$b0); # will we borrow?
+ &ld($b2,&QWPw(2,$bp));
+	&sub($a0,$cc,$o0);	# subtract the borrow
+ &ld($a3,&QWPw(3,$ap));
+ &add($b0,$tmp,$cc); ($t1,$o1)=&NR(2); &FR($tmp);
+
+ &cmpult($a1,$b1,$t1); # will we borrow?
+ &sub($a1,$b1,$a1); # do the subtract
+ &ld($b3,&QWPw(3,$bp));
+ &cmpult($a1,$cc,$b1); # will we borrow?
+	&sub($a1,$cc,$o1);	# subtract the borrow
+ &add($b1,$t1,$cc); ($tmp,$o2)=&NR(2); &FR($t1,$a1,$b1);
+
+ &cmpult($a2,$b2,$tmp); # will we borrow?
+ &sub($a2,$b2,$a2); # do the subtract
+ &st($o0,&QWPw(0,$rp)); &FR($o0); # save
+ &cmpult($a2,$cc,$b2); # will we borrow?
+	&sub($a2,$cc,$o2);	# subtract the borrow
+ &add($b2,$tmp,$cc); ($t3,$o3)=&NR(2); &FR($tmp,$a2,$b2);
+
+ &cmpult($a3,$b3,$t3); # will we borrow?
+ &sub($a3,$b3,$a3); # do the subtract
+ &st($o1,&QWPw(1,$rp)); &FR($o1);
+ &cmpult($a3,$cc,$b3); # will we borrow?
+	&sub($a3,$cc,$o3);	# subtract the borrow
+ &add($b3,$t3,$cc); &FR($t3,$a3,$b3);
+
+ &st($o2,&QWPw(2,$rp)); &FR($o2);
+ &sub($count,4,$count); # count-=4
+ &st($o3,&QWPw(3,$rp)); &FR($o3);
+	&add($ap,4*$QWS,$ap);	# ap+=4
+	&add($bp,4*$QWS,$bp);	# bp+=4
+	&add($rp,4*$QWS,$rp);	# rp+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &sub($a0,$b0,$a0); # do the subtract
+ &cmpult($a0,$cc,$b0); # will we borrow?
+	&sub($a0,$cc,$a0);	# subtract the borrow
+ &st($a0,&QWPw(0,$rp)); # save
+ &add($b0,$tmp,$cc); # add the borrows
+
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &FR($a0,$b0);
+ &set_label("end");
+ &function_end($name);
+
+ &fin_pool;
+ }
+
+1;
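
Each word of the subtraction needs two borrow tests: one for a-b and one for subtracting the incoming borrow. The two can never both fire, which is why the code above simply adds the two cmpult results to form the outgoing borrow. The same step in C (same hypothetical BN_ULONG as the earlier sketches):

    /* one word of bn_sub_words: *r = a - b - c, returning the new borrow */
    static BN_ULONG sub_word(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *r)
        {
        BN_ULONG t1 = (a < b);       /* will a - b borrow?         */
        BN_ULONG t  = a - b;
        BN_ULONG t2 = (t < c);       /* will t - c borrow?         */
        *r = t - c;
        return t1 + t2;              /* never both set, so 0 or 1  */
        }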
diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl
index 128f0f29d6..7a03c67b5b 100644
--- a/crypto/bn/asm/bn-586.pl
+++ b/crypto/bn/asm/bn-586.pl
@@ -1,7 +1,4 @@
#!/usr/local/bin/perl
-#
-
-#!/usr/local/bin/perl
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
@@ -11,8 +8,9 @@ require "x86asm.pl";
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
-&bn_div64("bn_div64");
+&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
&asm_finish();
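(This generator is what produces the bn86unix.cpp and bn-win32.asm files further down in this change; the build of the period invoked it as something like "perl bn-586.pl cpp >bn86unix.cpp" and "perl bn-586.pl win32 >bn-win32.asm", though the exact target names are an assumption here, not taken from this diff.)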
@@ -228,7 +226,7 @@ sub bn_sqr_words
&function_end($name);
}
-sub bn_div64
+sub bn_div_words
{
local($name)=@_;
@@ -307,7 +305,79 @@ sub bn_add_words
}
&set_label("aw_end",0);
- &mov("eax",$c);
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+	 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+	 &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
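
For reference, the whole of bn_sub_words above reduces to the following C loop, with the borrow carried word to word in what the assembly keeps in eax; since eax is also the return register, the final mov that this diff comments out really is redundant. A sketch only, assuming a word-sized BN_ULONG:

    BN_ULONG bn_sub_words_ref(BN_ULONG *r, const BN_ULONG *a,
                              const BN_ULONG *b, int n)
        {
        BN_ULONG c = 0;                 /* borrow, always 0 or 1     */
        int i;

        for (i = 0; i < n; i++)
            {
            BN_ULONG t = a[i] - c;      /* sub ecx,eax               */
            c = (a[i] < c);             /* mov eax,0 ; adc eax,eax   */
            r[i] = t - b[i];            /* sub ecx,edx               */
            c += (t < b[i]);            /* adc eax,0                 */
            }
        return c;                       /* already sitting in eax    */
        }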
diff --git a/crypto/bn/asm/bn-alpha.pl b/crypto/bn/asm/bn-alpha.pl
new file mode 100644
index 0000000000..302edf2376
--- /dev/null
+++ b/crypto/bn/asm/bn-alpha.pl
@@ -0,0 +1,571 @@
+#!/usr/local/bin/perl
+# I have this in perl so I can use more useful register names and then convert
+# them into alpha registers.
+#
+
+$d=&data();
+$d =~ s/CC/0/g;
+$d =~ s/R1/1/g;
+$d =~ s/R2/2/g;
+$d =~ s/R3/3/g;
+$d =~ s/R4/4/g;
+$d =~ s/L1/5/g;
+$d =~ s/L2/6/g;
+$d =~ s/L3/7/g;
+$d =~ s/L4/8/g;
+$d =~ s/O1/22/g;
+$d =~ s/O2/23/g;
+$d =~ s/O3/24/g;
+$d =~ s/O4/25/g;
+$d =~ s/A1/20/g;
+$d =~ s/A2/21/g;
+$d =~ s/A3/27/g;
+$d =~ s/A4/28/g;
+
+print $d;
+
+sub data
+ {
+ local($data)=<<'EOF';
+
+ # DEC Alpha assember
+ # The bn_div_words is actually gcc output but the other parts are hand done.
+ # Thanks to tzeruch@ceddec.com for sending me the gcc output for
+ # bn_div_words.
+	# I've gone back and re-done most of the routines.
+	# The key thing to remember for the 164 CPU is that while a
+	# multiply operation takes 8 cycles, another one can only be issued
+	# after 4 cycles have elapsed. I've made modifications to help
+	# improve this. Also, normally, the result of a ld instruction
+	# will not be available for about 3 cycles.
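+	# (To hide those latencies, the unrolled loops below interleave the
+	# mulq/umulh pairs of four independent words with the ldq, addq and
+	# cmpult work of their neighbours, so no result is consumed in the
+	# cycle right after it is produced.)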
+ .file 1 "bn_asm.c"
+ .set noat
+gcc2_compiled.:
+__gnu_compiled_c:
+ .text
+ .align 3
+ .globl bn_mul_add_words
+ .ent bn_mul_add_words
+bn_mul_add_words:
+bn_mul_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$CC
+ blt $18,$43 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $A1,0($17) # 1 1
+ ldq $R1,0($16) # 1 1
+ .align 3
+$42:
+ mulq $A1,$19,$L1 # 1 2 1 ######
+ ldq $A2,8($17) # 2 1
+ ldq $R2,8($16) # 2 1
+ umulh $A1,$19,$A1 # 1 2 ######
+ ldq $A3,16($17) # 3 1
+ ldq $R3,16($16) # 3 1
+ mulq $A2,$19,$L2 # 2 2 1 ######
+ ldq $A4,24($17) # 4 1
+ addq $R1,$L1,$R1 # 1 2 2
+ ldq $R4,24($16) # 4 1
+ umulh $A2,$19,$A2 # 2 2 ######
+ cmpult $R1,$L1,$O1 # 1 2 3 1
+ addq $A1,$O1,$A1 # 1 3 1
+ addq $R1,$CC,$R1 # 1 2 3 1
+ mulq $A3,$19,$L3 # 3 2 1 ######
+ cmpult $R1,$CC,$CC # 1 2 3 2
+ addq $R2,$L2,$R2 # 2 2 2
+ addq $A1,$CC,$CC # 1 3 2
+ cmpult $R2,$L2,$O2 # 2 2 3 1
+ addq $A2,$O2,$A2 # 2 3 1
+ umulh $A3,$19,$A3 # 3 2 ######
+ addq $R2,$CC,$R2 # 2 2 3 1
+ cmpult $R2,$CC,$CC # 2 2 3 2
+ subq $18,4,$18
+ mulq $A4,$19,$L4 # 4 2 1 ######
+ addq $A2,$CC,$CC # 2 3 2
+ addq $R3,$L3,$R3 # 3 2 2
+ addq $16,32,$16
+ cmpult $R3,$L3,$O3 # 3 2 3 1
+ stq $R1,-32($16) # 1 2 4
+ umulh $A4,$19,$A4 # 4 2 ######
+ addq $A3,$O3,$A3 # 3 3 1
+ addq $R3,$CC,$R3 # 3 2 3 1
+ stq $R2,-24($16) # 2 2 4
+ cmpult $R3,$CC,$CC # 3 2 3 2
+ stq $R3,-16($16) # 3 2 4
+ addq $R4,$L4,$R4 # 4 2 2
+ addq $A3,$CC,$CC # 3 3 2
+ cmpult $R4,$L4,$O4 # 4 2 3 1
+ addq $17,32,$17
+ addq $A4,$O4,$A4 # 4 3 1
+ addq $R4,$CC,$R4 # 4 2 3 1
+ cmpult $R4,$CC,$CC # 4 2 3 2
+ stq $R4,-8($16) # 4 2 4
+ addq $A4,$CC,$CC # 4 3 2
+ blt $18,$43
+
+ ldq $A1,0($17) # 1 1
+ ldq $R1,0($16) # 1 1
+
+ br $42
+
+ .align 4
+$45:
+ ldq $A1,0($17) # 4 1
+ ldq $R1,0($16) # 4 1
+ mulq $A1,$19,$L1 # 4 2 1
+ subq $18,1,$18
+ addq $16,8,$16
+ addq $17,8,$17
+ umulh $A1,$19,$A1 # 4 2
+ addq $R1,$L1,$R1 # 4 2 2
+ cmpult $R1,$L1,$O1 # 4 2 3 1
+ addq $A1,$O1,$A1 # 4 3 1
+ addq $R1,$CC,$R1 # 4 2 3 1
+ cmpult $R1,$CC,$CC # 4 2 3 2
+ addq $A1,$CC,$CC # 4 3 2
+ stq $R1,-8($16) # 4 2 4
+ bgt $18,$45
+ ret $31,($26),1 # else exit
+
+ .align 4
+$43:
+ addq $18,4,$18
+ bgt $18,$45 # goto tail code
+ ret $31,($26),1 # else exit
+
+ .end bn_mul_add_words
+ .align 3
+ .globl bn_mul_words
+ .ent bn_mul_words
+bn_mul_words:
+bn_mul_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ .align 5
+ subq $18,4,$18
+ bis $31,$31,$CC
+ blt $18,$143 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $A1,0($17) # 1 1
+ .align 3
+$142:
+
+ mulq $A1,$19,$L1 # 1 2 1 #####
+ ldq $A2,8($17) # 2 1
+ ldq $A3,16($17) # 3 1
+ umulh $A1,$19,$A1 # 1 2 #####
+ ldq $A4,24($17) # 4 1
+ mulq $A2,$19,$L2 # 2 2 1 #####
+ addq $L1,$CC,$L1 # 1 2 3 1
+ subq $18,4,$18
+ cmpult $L1,$CC,$CC # 1 2 3 2
+ umulh $A2,$19,$A2 # 2 2 #####
+ addq $A1,$CC,$CC # 1 3 2
+ addq $17,32,$17
+ addq $L2,$CC,$L2 # 2 2 3 1
+ mulq $A3,$19,$L3 # 3 2 1 #####
+ cmpult $L2,$CC,$CC # 2 2 3 2
+ addq $A2,$CC,$CC # 2 3 2
+ addq $16,32,$16
+ umulh $A3,$19,$A3 # 3 2 #####
+ stq $L1,-32($16) # 1 2 4
+ mulq $A4,$19,$L4 # 4 2 1 #####
+ addq $L3,$CC,$L3 # 3 2 3 1
+ stq $L2,-24($16) # 2 2 4
+ cmpult $L3,$CC,$CC # 3 2 3 2
+ umulh $A4,$19,$A4 # 4 2 #####
+ addq $A3,$CC,$CC # 3 3 2
+ stq $L3,-16($16) # 3 2 4
+ addq $L4,$CC,$L4 # 4 2 3 1
+ cmpult $L4,$CC,$CC # 4 2 3 2
+
+ addq $A4,$CC,$CC # 4 3 2
+
+ stq $L4,-8($16) # 4 2 4
+
+ blt $18,$143
+
+ ldq $A1,0($17) # 1 1
+
+ br $142
+
+ .align 4
+$145:
+ ldq $A1,0($17) # 4 1
+ mulq $A1,$19,$L1 # 4 2 1
+ subq $18,1,$18
+ umulh $A1,$19,$A1 # 4 2
+ addq $L1,$CC,$L1 # 4 2 3 1
+ addq $16,8,$16
+ cmpult $L1,$CC,$CC # 4 2 3 2
+ addq $17,8,$17
+ addq $A1,$CC,$CC # 4 3 2
+ stq $L1,-8($16) # 4 2 4
+
+ bgt $18,$145
+ ret $31,($26),1 # else exit
+
+ .align 4
+$143:
+ addq $18,4,$18
+ bgt $18,$145 # goto tail code
+ ret $31,($26),1 # else exit
+
+ .end bn_mul_words
+ .align 3
+ .globl bn_sqr_words
+ .ent bn_sqr_words
+bn_sqr_words:
+bn_sqr_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $18,4,$18
+ blt $18,$543 # if we are -1, -2, -3 or -4 goto tail code
+ ldq $A1,0($17) # 1 1
+ .align 3
+$542:
+ mulq $A1,$A1,$L1 ######
+ ldq $A2,8($17) # 1 1
+	subq	$18,4,$18
+ umulh $A1,$A1,$R1 ######
+ ldq $A3,16($17) # 1 1
+ mulq $A2,$A2,$L2 ######
+ ldq $A4,24($17) # 1 1
+ stq $L1,0($16) # r[0]
+ umulh $A2,$A2,$R2 ######
+ stq $R1,8($16) # r[1]
+ mulq $A3,$A3,$L3 ######
+	stq	$L2,16($16)	# r[2]
+	umulh	$A3,$A3,$R3	######
+	stq	$R2,24($16)	# r[3]
+	mulq	$A4,$A4,$L4	######
+	stq	$L3,32($16)	# r[4]
+	umulh	$A4,$A4,$R4	######
+	stq	$R3,40($16)	# r[5]
+
+	addq	$16,64,$16
+	addq	$17,32,$17
+	stq	$L4,-16($16)	# r[6]
+	stq	$R4,-8($16)	# r[7]
+
+ blt $18,$543
+ ldq $A1,0($17) # 1 1
+ br $542
+
+$442:
+ ldq $A1,0($17) # a[0]
+	mulq	$A1,$A1,$L1	# a[0]*a[0] low part
+ addq $16,16,$16
+ addq $17,8,$17
+ subq $18,1,$18
+	umulh	$A1,$A1,$R1	# a[0]*a[0] high part
+ stq $L1,-16($16) # r[0]
+ stq $R1,-8($16) # r[1]
+
+ bgt $18,$442
+ ret $31,($26),1 # else exit
+
+ .align 4
+$543:
+ addq $18,4,$18
+ bgt $18,$442 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_sqr_words
+
+ .align 3
+ .globl bn_add_words
+ .ent bn_add_words
+bn_add_words:
+bn_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19,4,$19
+ bis $31,$31,$CC # carry = 0
+ blt $19,$900
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ .align 3
+$901:
+ addq $R1,$L1,$R1 # r=a+b;
+ ldq $L2,8($17) # a[1]
+ cmpult $R1,$L1,$O1 # did we overflow?
+ ldq $R2,8($18) # b[1]
+ addq $R1,$CC,$R1 # c+= overflow
+ ldq $L3,16($17) # a[2]
+ cmpult $R1,$CC,$CC # overflow?
+ ldq $R3,16($18) # b[2]
+ addq $CC,$O1,$CC
+ ldq $L4,24($17) # a[3]
+ addq $R2,$L2,$R2 # r=a+b;
+ ldq $R4,24($18) # b[3]
+ cmpult $R2,$L2,$O2 # did we overflow?
+ addq $R3,$L3,$R3 # r=a+b;
+ addq $R2,$CC,$R2 # c+= overflow
+ cmpult $R3,$L3,$O3 # did we overflow?
+ cmpult $R2,$CC,$CC # overflow?
+ addq $R4,$L4,$R4 # r=a+b;
+ addq $CC,$O2,$CC
+ cmpult $R4,$L4,$O4 # did we overflow?
+ addq $R3,$CC,$R3 # c+= overflow
+ stq $R1,0($16) # r[0]=c
+ cmpult $R3,$CC,$CC # overflow?
+ stq $R2,8($16) # r[1]=c
+ addq $CC,$O3,$CC
+ stq $R3,16($16) # r[2]=c
+ addq $R4,$CC,$R4 # c+= overflow
+ subq $19,4,$19 # loop--
+ cmpult $R4,$CC,$CC # overflow?
+ addq $17,32,$17 # a++
+ addq $CC,$O4,$CC
+ stq $R4,24($16) # r[3]=c
+ addq $18,32,$18 # b++
+ addq $16,32,$16 # r++
+
+ blt $19,$900
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ br $901
+ .align 4
+$945:
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ addq $R1,$L1,$R1 # r=a+b;
+ subq $19,1,$19 # loop--
+ addq $R1,$CC,$R1 # c+= overflow
+ addq $17,8,$17 # a++
+ cmpult $R1,$L1,$O1 # did we overflow?
+ cmpult $R1,$CC,$CC # overflow?
+ addq $18,8,$18 # b++
+ stq $R1,0($16) # r[0]=c
+ addq $CC,$O1,$CC
+ addq $16,8,$16 # r++
+
+ bgt $19,$945
+ ret $31,($26),1 # else exit
+
+$900:
+ addq $19,4,$19
+ bgt $19,$945 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_add_words
+
+ .align 3
+ .globl bn_sub_words
+ .ent bn_sub_words
+bn_sub_words:
+bn_sub_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19,4,$19
+ bis $31,$31,$CC # carry = 0
+	br	$800		# the unrolled loop below is the bn_add_words
+				# body and is disabled; only the tail loop at
+				# $845 does the actual subtraction
+	blt	$19,$800	# unreachable while the br above stands
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ .align 3
+$801:
+ addq $R1,$L1,$R1 # r=a+b;
+ ldq $L2,8($17) # a[1]
+ cmpult $R1,$L1,$O1 # did we overflow?
+ ldq $R2,8($18) # b[1]
+ addq $R1,$CC,$R1 # c+= overflow
+ ldq $L3,16($17) # a[2]
+ cmpult $R1,$CC,$CC # overflow?
+ ldq $R3,16($18) # b[2]
+ addq $CC,$O1,$CC
+ ldq $L4,24($17) # a[3]
+ addq $R2,$L2,$R2 # r=a+b;
+ ldq $R4,24($18) # b[3]
+ cmpult $R2,$L2,$O2 # did we overflow?
+ addq $R3,$L3,$R3 # r=a+b;
+ addq $R2,$CC,$R2 # c+= overflow
+ cmpult $R3,$L3,$O3 # did we overflow?
+ cmpult $R2,$CC,$CC # overflow?
+ addq $R4,$L4,$R4 # r=a+b;
+ addq $CC,$O2,$CC
+ cmpult $R4,$L4,$O4 # did we overflow?
+ addq $R3,$CC,$R3 # c+= overflow
+ stq $R1,0($16) # r[0]=c
+ cmpult $R3,$CC,$CC # overflow?
+ stq $R2,8($16) # r[1]=c
+ addq $CC,$O3,$CC
+ stq $R3,16($16) # r[2]=c
+ addq $R4,$CC,$R4 # c+= overflow
+ subq $19,4,$19 # loop--
+ cmpult $R4,$CC,$CC # overflow?
+ addq $17,32,$17 # a++
+ addq $CC,$O4,$CC
+ stq $R4,24($16) # r[3]=c
+ addq $18,32,$18 # b++
+ addq $16,32,$16 # r++
+
+ blt $19,$800
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ br $801
+ .align 4
+$845:
+ ldq $L1,0($17) # a[0]
+	ldq	$R1,0($18)	# b[0]
+ cmpult $L1,$R1,$O1 # will we borrow?
+ subq $L1,$R1,$R1 # r=a-b;
+ subq $19,1,$19 # loop--
+ cmpult $R1,$CC,$O2 # will we borrow?
+	subq	$R1,$CC,$R1	# subtract the borrow
+ addq $17,8,$17 # a++
+ addq $18,8,$18 # b++
+ stq $R1,0($16) # r[0]=c
+ addq $O2,$O1,$CC
+ addq $16,8,$16 # r++
+
+ bgt $19,$845
+ ret $31,($26),1 # else exit
+
+$800:
+ addq $19,4,$19
+ bgt $19,$845 # goto tail code
+ ret $31,($26),1 # else exit
+ .end bn_sub_words
+
+ #
+ # What follows was taken directly from the C compiler with a few
+	# hacks to redo the labels.
+ #
+.text
+ .align 3
+ .globl bn_div_words
+ .ent bn_div_words
+bn_div_words:
+ ldgp $29,0($27)
+bn_div_words..ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$119
+ lda $0,-1
+ br $31,$136
+ .align 4
+$119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$126
+ zapnot $7,15,$27
+ br $31,$127
+ .align 4
+$126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$127:
+ srl $10,32,$4
+ .align 5
+$128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$129
+ subq $27,1,$27
+ br $31,$128
+ .align 4
+$129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$134
+ addq $9,$11,$9
+ subq $27,1,$27
+$134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$123
+ .align 4
+$124:
+ bis $13,$27,$0
+$136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div_words
+EOF
+ return($data);
+ }
+
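
Stripped of the normalisation and the 32-bit half-word long division, what the compiler output above implements is a double-word by single-word divide. A behavioural sketch (not the real algorithm), again leaning on the hypothetical BN_ULLONG from the earlier sketches:

    /* quotient of (h*2^64 + l) / d; assumes h < d so it fits in a word */
    static BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
        {
        BN_ULLONG n = ((BN_ULLONG)h << 64) | l;
        return (BN_ULONG)(n / d);
        }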
diff --git a/crypto/bn/asm/bn-win32.asm b/crypto/bn/asm/bn-win32.asm
index 017ea462b0..871bd88d77 100644
--- a/crypto/bn/asm/bn-win32.asm
+++ b/crypto/bn/asm/bn-win32.asm
@@ -485,9 +485,9 @@ $L010sw_end:
_bn_sqr_words ENDP
_TEXT ENDS
_TEXT SEGMENT
-PUBLIC _bn_div64
+PUBLIC _bn_div_words
-_bn_div64 PROC NEAR
+_bn_div_words PROC NEAR
push ebp
push ebx
push esi
@@ -501,7 +501,7 @@ _bn_div64 PROC NEAR
pop ebx
pop ebp
ret
-_bn_div64 ENDP
+_bn_div_words ENDP
_TEXT ENDS
_TEXT SEGMENT
PUBLIC _bn_add_words
@@ -678,7 +678,6 @@ $L011aw_finish:
adc eax, 0
mov DWORD PTR 24[ebx],ecx
$L013aw_end:
- mov eax, eax
pop edi
pop esi
pop ebx
@@ -686,4 +685,1438 @@ $L013aw_end:
ret
_bn_add_words ENDP
_TEXT ENDS
+_TEXT SEGMENT
+PUBLIC _bn_sub_words
+
+_bn_sub_words PROC NEAR
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ mov ebx, DWORD PTR 20[esp]
+ mov esi, DWORD PTR 24[esp]
+ mov edi, DWORD PTR 28[esp]
+ mov ebp, DWORD PTR 32[esp]
+ xor eax, eax
+ and ebp, 4294967288
+ jz $L014aw_finish
+L015aw_loop:
+ ; Round 0
+ mov ecx, DWORD PTR [esi]
+ mov edx, DWORD PTR [edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR [ebx],ecx
+ ; Round 1
+ mov ecx, DWORD PTR 4[esi]
+ mov edx, DWORD PTR 4[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 4[ebx],ecx
+ ; Round 2
+ mov ecx, DWORD PTR 8[esi]
+ mov edx, DWORD PTR 8[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 8[ebx],ecx
+ ; Round 3
+ mov ecx, DWORD PTR 12[esi]
+ mov edx, DWORD PTR 12[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 12[ebx],ecx
+ ; Round 4
+ mov ecx, DWORD PTR 16[esi]
+ mov edx, DWORD PTR 16[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 16[ebx],ecx
+ ; Round 5
+ mov ecx, DWORD PTR 20[esi]
+ mov edx, DWORD PTR 20[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 20[ebx],ecx
+ ; Round 6
+ mov ecx, DWORD PTR 24[esi]
+ mov edx, DWORD PTR 24[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 24[ebx],ecx
+ ; Round 7
+ mov ecx, DWORD PTR 28[esi]
+ mov edx, DWORD PTR 28[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 28[ebx],ecx
+ ;
+ add esi, 32
+ add edi, 32
+ add ebx, 32
+ sub ebp, 8
+ jnz L015aw_loop
+$L014aw_finish:
+ mov ebp, DWORD PTR 32[esp]
+ and ebp, 7
+ jz $L016aw_end
+ ; Tail Round 0
+ mov ecx, DWORD PTR [esi]
+ mov edx, DWORD PTR [edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR [ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 1
+ mov ecx, DWORD PTR 4[esi]
+ mov edx, DWORD PTR 4[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR 4[ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 2
+ mov ecx, DWORD PTR 8[esi]
+ mov edx, DWORD PTR 8[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR 8[ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 3
+ mov ecx, DWORD PTR 12[esi]
+ mov edx, DWORD PTR 12[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR 12[ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 4
+ mov ecx, DWORD PTR 16[esi]
+ mov edx, DWORD PTR 16[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR 16[ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 5
+ mov ecx, DWORD PTR 20[esi]
+ mov edx, DWORD PTR 20[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ dec ebp
+ mov DWORD PTR 20[ebx],ecx
+ jz $L016aw_end
+ ; Tail Round 6
+ mov ecx, DWORD PTR 24[esi]
+ mov edx, DWORD PTR 24[edi]
+ sub ecx, eax
+ mov eax, 0
+ adc eax, eax
+ sub ecx, edx
+ adc eax, 0
+ mov DWORD PTR 24[ebx],ecx
+$L016aw_end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+_bn_sub_words ENDP
+_TEXT ENDS
+_TEXT SEGMENT
+PUBLIC _bn_mul_comba8
+
+_bn_mul_comba8 PROC NEAR
+ push esi
+ mov esi, DWORD PTR 12[esp]
+ push edi
+ mov edi, DWORD PTR 20[esp]
+ push ebp
+ push ebx
+ xor ebx, ebx
+ mov eax, DWORD PTR [esi]
+ xor ecx, ecx
+ mov edx, DWORD PTR [edi]
+ ; ################## Calculate word 0
+ xor ebp, ebp
+ ; mul a[0]*b[0]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR [edi]
+ adc ebp, 0
+ mov DWORD PTR [eax],ebx
+ mov eax, DWORD PTR 4[esi]
+ ; saved r[0]
+ ; ################## Calculate word 1
+ xor ebx, ebx
+ ; mul a[1]*b[0]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR [esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebx, 0
+ ; mul a[0]*b[1]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR [edi]
+ adc ebx, 0
+ mov DWORD PTR 4[eax],ecx
+ mov eax, DWORD PTR 8[esi]
+ ; saved r[1]
+ ; ################## Calculate word 2
+ xor ecx, ecx
+ ; mul a[2]*b[0]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ecx, 0
+ ; mul a[1]*b[1]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR [esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ecx, 0
+ ; mul a[0]*b[2]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR [edi]
+ adc ecx, 0
+ mov DWORD PTR 8[eax],ebp
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[2]
+ ; ################## Calculate word 3
+ xor ebp, ebp
+ ; mul a[3]*b[0]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebp, 0
+ ; mul a[2]*b[1]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebp, 0
+ ; mul a[1]*b[2]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR [esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebp, 0
+ ; mul a[0]*b[3]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR [edi]
+ adc ebp, 0
+ mov DWORD PTR 12[eax],ebx
+ mov eax, DWORD PTR 16[esi]
+ ; saved r[3]
+ ; ################## Calculate word 4
+ xor ebx, ebx
+ ; mul a[4]*b[0]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebx, 0
+ ; mul a[3]*b[1]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebx, 0
+ ; mul a[2]*b[2]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebx, 0
+ ; mul a[1]*b[3]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR [esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebx, 0
+ ; mul a[0]*b[4]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR [edi]
+ adc ebx, 0
+ mov DWORD PTR 16[eax],ecx
+ mov eax, DWORD PTR 20[esi]
+ ; saved r[4]
+ ; ################## Calculate word 5
+ xor ecx, ecx
+ ; mul a[5]*b[0]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ecx, 0
+ ; mul a[4]*b[1]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ecx, 0
+ ; mul a[3]*b[2]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ecx, 0
+ ; mul a[2]*b[3]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ecx, 0
+ ; mul a[1]*b[4]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR [esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ecx, 0
+ ; mul a[0]*b[5]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR [edi]
+ adc ecx, 0
+ mov DWORD PTR 20[eax],ebp
+ mov eax, DWORD PTR 24[esi]
+ ; saved r[5]
+ ; ################## Calculate word 6
+ xor ebp, ebp
+ ; mul a[6]*b[0]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebp, 0
+ ; mul a[5]*b[1]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebp, 0
+ ; mul a[4]*b[2]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebp, 0
+ ; mul a[3]*b[3]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebp, 0
+ ; mul a[2]*b[4]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ebp, 0
+ ; mul a[1]*b[5]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR [esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebp, 0
+ ; mul a[0]*b[6]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR [edi]
+ adc ebp, 0
+ mov DWORD PTR 24[eax],ebx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[6]
+ ; ################## Calculate word 7
+ xor ebx, ebx
+ ; mul a[7]*b[0]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebx, 0
+ ; mul a[6]*b[1]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebx, 0
+ ; mul a[5]*b[2]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebx, 0
+ ; mul a[4]*b[3]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebx, 0
+ ; mul a[3]*b[4]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ebx, 0
+ ; mul a[2]*b[5]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebx, 0
+ ; mul a[1]*b[6]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR [esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebx, 0
+ ; mul a[0]*b[7]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebx, 0
+ mov DWORD PTR 28[eax],ecx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[7]
+ ; ################## Calculate word 8
+ xor ecx, ecx
+ ; mul a[7]*b[1]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ecx, 0
+ ; mul a[6]*b[2]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ecx, 0
+ ; mul a[5]*b[3]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ecx, 0
+ ; mul a[4]*b[4]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ecx, 0
+ ; mul a[3]*b[5]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ecx, 0
+ ; mul a[2]*b[6]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ecx, 0
+ ; mul a[1]*b[7]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ecx, 0
+ mov DWORD PTR 32[eax],ebp
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[8]
+ ; ################## Calculate word 9
+ xor ebp, ebp
+ ; mul a[7]*b[2]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebp, 0
+ ; mul a[6]*b[3]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebp, 0
+ ; mul a[5]*b[4]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ebp, 0
+ ; mul a[4]*b[5]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebp, 0
+ ; mul a[3]*b[6]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebp, 0
+ ; mul a[2]*b[7]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebp, 0
+ mov DWORD PTR 36[eax],ebx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[9]
+ ; ################## Calculate word 10
+ xor ebx, ebx
+ ; mul a[7]*b[3]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebx, 0
+ ; mul a[6]*b[4]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ebx, 0
+ ; mul a[5]*b[5]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebx, 0
+ ; mul a[4]*b[6]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 12[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebx, 0
+ ; mul a[3]*b[7]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR 16[edi]
+ adc ebx, 0
+ mov DWORD PTR 40[eax],ecx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[10]
+ ; ################## Calculate word 11
+ xor ecx, ecx
+ ; mul a[7]*b[4]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ecx, 0
+ ; mul a[6]*b[5]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ecx, 0
+ ; mul a[5]*b[6]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 16[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ecx, 0
+ ; mul a[4]*b[7]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR 20[edi]
+ adc ecx, 0
+ mov DWORD PTR 44[eax],ebp
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[11]
+ ; ################## Calculate word 12
+ xor ebp, ebp
+ ; mul a[7]*b[5]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebp, 0
+ ; mul a[6]*b[6]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebp, 0
+ ; mul a[5]*b[7]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR 24[edi]
+ adc ebp, 0
+ mov DWORD PTR 48[eax],ebx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[12]
+ ; ################## Calculate word 13
+ xor ebx, ebx
+ ; mul a[7]*b[6]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 24[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebx, 0
+ ; mul a[6]*b[7]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR 28[edi]
+ adc ebx, 0
+ mov DWORD PTR 52[eax],ecx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[13]
+ ; ################## Calculate word 14
+ xor ecx, ecx
+ ; mul a[7]*b[7]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ adc ecx, 0
+ mov DWORD PTR 56[eax],ebp
+ ; saved r[14]
+ ; save r[15]
+ mov DWORD PTR 60[eax],ebx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+_bn_mul_comba8 ENDP
+_TEXT ENDS
+_TEXT SEGMENT
+PUBLIC _bn_mul_comba4
+
+_bn_mul_comba4 PROC NEAR
+ push esi
+ mov esi, DWORD PTR 12[esp]
+ push edi
+ mov edi, DWORD PTR 20[esp]
+ push ebp
+ push ebx
+ xor ebx, ebx
+ mov eax, DWORD PTR [esi]
+ xor ecx, ecx
+ mov edx, DWORD PTR [edi]
+ ; ################## Calculate word 0
+ xor ebp, ebp
+ ; mul a[0]*b[0]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR [edi]
+ adc ebp, 0
+ mov DWORD PTR [eax],ebx
+ mov eax, DWORD PTR 4[esi]
+ ; saved r[0]
+ ; ################## Calculate word 1
+ xor ebx, ebx
+ ; mul a[1]*b[0]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR [esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebx, 0
+ ; mul a[0]*b[1]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR [edi]
+ adc ebx, 0
+ mov DWORD PTR 4[eax],ecx
+ mov eax, DWORD PTR 8[esi]
+ ; saved r[1]
+ ; ################## Calculate word 2
+ xor ecx, ecx
+ ; mul a[2]*b[0]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ecx, 0
+ ; mul a[1]*b[1]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR [esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ecx, 0
+ ; mul a[0]*b[2]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR [edi]
+ adc ecx, 0
+ mov DWORD PTR 8[eax],ebp
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[2]
+ ; ################## Calculate word 3
+ xor ebp, ebp
+ ; mul a[3]*b[0]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebp, 0
+ ; mul a[2]*b[1]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebp, 0
+ ; mul a[1]*b[2]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR [esi]
+ adc ecx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebp, 0
+ ; mul a[0]*b[3]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ mov edx, DWORD PTR 4[edi]
+ adc ebp, 0
+ mov DWORD PTR 12[eax],ebx
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[3]
+ ; ################## Calculate word 4
+ xor ebx, ebx
+ ; mul a[3]*b[1]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebx, 0
+ ; mul a[2]*b[2]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 4[esi]
+ adc ebp, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ebx, 0
+ ; mul a[1]*b[3]
+ mul edx
+ add ecx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebp, edx
+ mov edx, DWORD PTR 8[edi]
+ adc ebx, 0
+ mov DWORD PTR 16[eax],ecx
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[4]
+ ; ################## Calculate word 5
+ xor ecx, ecx
+ ; mul a[3]*b[2]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ecx, 0
+ ; mul a[2]*b[3]
+ mul edx
+ add ebp, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ebx, edx
+ mov edx, DWORD PTR 12[edi]
+ adc ecx, 0
+ mov DWORD PTR 20[eax],ebp
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[5]
+ ; ################## Calculate word 6
+ xor ebp, ebp
+ ; mul a[3]*b[3]
+ mul edx
+ add ebx, eax
+ mov eax, DWORD PTR 20[esp]
+ adc ecx, edx
+ adc ebp, 0
+ mov DWORD PTR 24[eax],ebx
+ ; saved r[6]
+ ; save r[7]
+ mov DWORD PTR 28[eax],ecx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+_bn_mul_comba4 ENDP
+_TEXT ENDS
+_TEXT SEGMENT
+PUBLIC _bn_sqr_comba8
+
+_bn_sqr_comba8 PROC NEAR
+ push esi
+ push edi
+ push ebp
+ push ebx
+ mov edi, DWORD PTR 20[esp]
+ mov esi, DWORD PTR 24[esp]
+ xor ebx, ebx
+ xor ecx, ecx
+ mov eax, DWORD PTR [esi]
+ ; ############### Calculate word 0
+ xor ebp, ebp
+ ; sqr a[0]*a[0]
+ mul eax
+ add ebx, eax
+ adc ecx, edx
+ mov edx, DWORD PTR [esi]
+ adc ebp, 0
+ mov DWORD PTR [edi],ebx
+ mov eax, DWORD PTR 4[esi]
+ ; saved r[0]
+ ; ############### Calculate word 1
+ xor ebx, ebx
+ ; sqr a[1]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, 0
+ mov DWORD PTR 4[edi],ecx
+ mov edx, DWORD PTR [esi]
+ ; saved r[1]
+ ; ############### Calculate word 2
+ xor ecx, ecx
+ ; sqr a[2]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 4[esi]
+ adc ecx, 0
+ ; sqr a[1]*a[1]
+ mul eax
+ add ebp, eax
+ adc ebx, edx
+ mov edx, DWORD PTR [esi]
+ adc ecx, 0
+ mov DWORD PTR 8[edi],ebp
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[2]
+ ; ############### Calculate word 3
+ xor ebp, ebp
+ ; sqr a[3]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[2]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 16[esi]
+ adc ebp, 0
+ mov DWORD PTR 12[edi],ebx
+ mov edx, DWORD PTR [esi]
+ ; saved r[3]
+ ; ############### Calculate word 4
+ xor ebx, ebx
+ ; sqr a[4]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 12[esi]
+ adc ebx, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[3]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, 0
+ ; sqr a[2]*a[2]
+ mul eax
+ add ecx, eax
+ adc ebp, edx
+ mov edx, DWORD PTR [esi]
+ adc ebx, 0
+ mov DWORD PTR 16[edi],ecx
+ mov eax, DWORD PTR 20[esi]
+ ; saved r[4]
+ ; ############### Calculate word 5
+ xor ecx, ecx
+ ; sqr a[5]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 16[esi]
+ adc ecx, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[4]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 12[esi]
+ adc ecx, 0
+ mov edx, DWORD PTR 8[esi]
+ ; sqr a[3]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ecx, 0
+ mov DWORD PTR 20[edi],ebp
+ mov edx, DWORD PTR [esi]
+ ; saved r[5]
+ ; ############### Calculate word 6
+ xor ebp, ebp
+ ; sqr a[6]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 20[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[5]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 16[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 8[esi]
+ ; sqr a[4]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 12[esi]
+ adc ebp, 0
+ ; sqr a[3]*a[3]
+ mul eax
+ add ebx, eax
+ adc ecx, edx
+ mov edx, DWORD PTR [esi]
+ adc ebp, 0
+ mov DWORD PTR 24[edi],ebx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[6]
+ ; ############### Calculate word 7
+ xor ebx, ebx
+ ; sqr a[7]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ebx, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[6]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 20[esi]
+ adc ebx, 0
+ mov edx, DWORD PTR 8[esi]
+ ; sqr a[5]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 16[esi]
+ adc ebx, 0
+ mov edx, DWORD PTR 12[esi]
+ ; sqr a[4]*a[3]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 28[esi]
+ adc ebx, 0
+ mov DWORD PTR 28[edi],ecx
+ mov edx, DWORD PTR 4[esi]
+ ; saved r[7]
+ ; ############### Calculate word 8
+ xor ecx, ecx
+ ; sqr a[7]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ecx, 0
+ mov edx, DWORD PTR 8[esi]
+ ; sqr a[6]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 20[esi]
+ adc ecx, 0
+ mov edx, DWORD PTR 12[esi]
+ ; sqr a[5]*a[3]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 16[esi]
+ adc ecx, 0
+ ; sqr a[4]*a[4]
+ mul eax
+ add ebp, eax
+ adc ebx, edx
+ mov edx, DWORD PTR 8[esi]
+ adc ecx, 0
+ mov DWORD PTR 32[edi],ebp
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[8]
+ ; ############### Calculate word 9
+ xor ebp, ebp
+ ; sqr a[7]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 12[esi]
+ ; sqr a[6]*a[3]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 20[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 16[esi]
+ ; sqr a[5]*a[4]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 28[esi]
+ adc ebp, 0
+ mov DWORD PTR 36[edi],ebx
+ mov edx, DWORD PTR 12[esi]
+ ; saved r[9]
+ ; ############### Calculate word 10
+ xor ebx, ebx
+ ; sqr a[7]*a[3]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ebx, 0
+ mov edx, DWORD PTR 16[esi]
+ ; sqr a[6]*a[4]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 20[esi]
+ adc ebx, 0
+ ; sqr a[5]*a[5]
+ mul eax
+ add ecx, eax
+ adc ebp, edx
+ mov edx, DWORD PTR 16[esi]
+ adc ebx, 0
+ mov DWORD PTR 40[edi],ecx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[10]
+ ; ############### Calculate word 11
+ xor ecx, ecx
+ ; sqr a[7]*a[4]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ecx, 0
+ mov edx, DWORD PTR 20[esi]
+ ; sqr a[6]*a[5]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 28[esi]
+ adc ecx, 0
+ mov DWORD PTR 44[edi],ebp
+ mov edx, DWORD PTR 20[esi]
+ ; saved r[11]
+ ; ############### Calculate word 12
+ xor ebp, ebp
+ ; sqr a[7]*a[5]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 24[esi]
+ adc ebp, 0
+ ; sqr a[6]*a[6]
+ mul eax
+ add ebx, eax
+ adc ecx, edx
+ mov edx, DWORD PTR 24[esi]
+ adc ebp, 0
+ mov DWORD PTR 48[edi],ebx
+ mov eax, DWORD PTR 28[esi]
+ ; saved r[12]
+ ; ############### Calculate word 13
+ xor ebx, ebx
+ ; sqr a[7]*a[6]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 28[esi]
+ adc ebx, 0
+ mov DWORD PTR 52[edi],ecx
+ ; saved r[13]
+ ; ############### Calculate word 14
+ xor ecx, ecx
+ ; sqr a[7]*a[7]
+ mul eax
+ add ebp, eax
+ adc ebx, edx
+ adc ecx, 0
+ mov DWORD PTR 56[edi],ebp
+ ; saved r[14]
+ mov DWORD PTR 60[edi],ebx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+_bn_sqr_comba8 ENDP
+_TEXT ENDS
+_TEXT SEGMENT
+PUBLIC _bn_sqr_comba4
+
+_bn_sqr_comba4 PROC NEAR
+ push esi
+ push edi
+ push ebp
+ push ebx
+ mov edi, DWORD PTR 20[esp]
+ mov esi, DWORD PTR 24[esp]
+ xor ebx, ebx
+ xor ecx, ecx
+ mov eax, DWORD PTR [esi]
+ ; ############### Calculate word 0
+ xor ebp, ebp
+ ; sqr a[0]*a[0]
+ mul eax
+ add ebx, eax
+ adc ecx, edx
+ mov edx, DWORD PTR [esi]
+ adc ebp, 0
+ mov DWORD PTR [edi],ebx
+ mov eax, DWORD PTR 4[esi]
+ ; saved r[0]
+ ; ############### Calculate word 1
+ xor ebx, ebx
+ ; sqr a[1]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, 0
+ mov DWORD PTR 4[edi],ecx
+ mov edx, DWORD PTR [esi]
+ ; saved r[1]
+ ; ############### Calculate word 2
+ xor ecx, ecx
+ ; sqr a[2]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 4[esi]
+ adc ecx, 0
+ ; sqr a[1]*a[1]
+ mul eax
+ add ebp, eax
+ adc ebx, edx
+ mov edx, DWORD PTR [esi]
+ adc ecx, 0
+ mov DWORD PTR 8[edi],ebp
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[2]
+ ; ############### Calculate word 3
+ xor ebp, ebp
+ ; sqr a[3]*a[0]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebp, 0
+ mov edx, DWORD PTR 4[esi]
+ ; sqr a[2]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebp, 0
+ add ebx, eax
+ adc ecx, edx
+ mov eax, DWORD PTR 12[esi]
+ adc ebp, 0
+ mov DWORD PTR 12[edi],ebx
+ mov edx, DWORD PTR 4[esi]
+ ; saved r[3]
+ ; ############### Calculate word 4
+ xor ebx, ebx
+ ; sqr a[3]*a[1]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ebx, 0
+ add ecx, eax
+ adc ebp, edx
+ mov eax, DWORD PTR 8[esi]
+ adc ebx, 0
+ ; sqr a[2]*a[2]
+ mul eax
+ add ecx, eax
+ adc ebp, edx
+ mov edx, DWORD PTR 8[esi]
+ adc ebx, 0
+ mov DWORD PTR 16[edi],ecx
+ mov eax, DWORD PTR 12[esi]
+ ; saved r[4]
+ ; ############### Calculate word 5
+ xor ecx, ecx
+ ; sqr a[3]*a[2]
+ mul edx
+ add eax, eax
+ adc edx, edx
+ adc ecx, 0
+ add ebp, eax
+ adc ebx, edx
+ mov eax, DWORD PTR 12[esi]
+ adc ecx, 0
+ mov DWORD PTR 20[edi],ebp
+ ; saved r[5]
+ ; ############### Calculate word 6
+ xor ebp, ebp
+ ; sqr a[3]*a[3]
+ mul eax
+ add ebx, eax
+ adc ecx, edx
+ adc ebp, 0
+ mov DWORD PTR 24[edi],ebx
+ ; saved r[6]
+ mov DWORD PTR 28[edi],ecx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+_bn_sqr_comba4 ENDP
+_TEXT ENDS
END
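
The x86 squaring code handles the doubled cross products differently from the Alpha version: instead of recomputing carries with compares, it doubles the product in registers ("add eax, eax / adc edx, edx") and lets the carry flag chain the lost bit into the spill word. With 32-bit words, one such "sqr a[i]*a[j]" block corresponds roughly to the following C, carries made explicit (a sketch, not the generated code):

    typedef unsigned int u32;            /* BN_ULONG is 32-bit here  */
    typedef unsigned long long u64;

    /* c2:c1:c0 += 2*a*b, one off-diagonal column term (i != j) */
    static void sqr_add_c2_x86(u32 a, u32 b, u32 *c0, u32 *c1, u32 *c2)
        {
        u64 t = (u64)a * b;              /* mul: edx:eax = a*b        */
        u32 lo, hi, cf;

        *c2 += (u32)(t >> 63);           /* bit lost by the doubling  */
        t += t;                          /* add eax,eax ; adc edx,edx */
        lo = (u32)t;
        hi = (u32)(t >> 32);

        *c0 += lo;                       /* add <c0>,eax              */
        cf = (*c0 < lo);
        hi += cf;                        /* adc <c1>,edx, carry in... */
        cf = (hi < cf);
        *c1 += hi;
        cf += (*c1 < hi);                /* ...and carry out          */
        *c2 += cf;                       /* adc <c2>,0                */
        }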
diff --git a/crypto/bn/asm/bn86unix.cpp b/crypto/bn/asm/bn86unix.cpp
index 64702201ea..639a3ac41c 100644
--- a/crypto/bn/asm/bn86unix.cpp
+++ b/crypto/bn/asm/bn86unix.cpp
@@ -12,8 +12,13 @@
#define bn_mul_add_words _bn_mul_add_words
#define bn_mul_words _bn_mul_words
#define bn_sqr_words _bn_sqr_words
-#define bn_div64 _bn_div64
+#define bn_div_words _bn_div_words
#define bn_add_words _bn_add_words
+#define bn_sub_words _bn_sub_words
+#define bn_mul_comba8 _bn_mul_comba8
+#define bn_mul_comba4 _bn_mul_comba4
+#define bn_sqr_comba8 _bn_sqr_comba8
+#define bn_sqr_comba4 _bn_sqr_comba4
#endif
@@ -544,9 +549,9 @@ bn_sqr_words:
.ident "bn_sqr_words"
.text
.align ALIGN
-.globl bn_div64
- TYPE(bn_div64,@function)
-bn_div64:
+.globl bn_div_words
+ TYPE(bn_div_words,@function)
+bn_div_words:
pushl %ebp
pushl %ebx
pushl %esi
@@ -561,9 +566,9 @@ bn_div64:
popl %ebx
popl %ebp
ret
-.bn_div64_end:
- SIZE(bn_div64,.bn_div64_end-bn_div64)
-.ident "bn_div64"
+.bn_div_words_end:
+ SIZE(bn_div_words,.bn_div_words_end-bn_div_words)
+.ident "bn_div_words"
.text
.align ALIGN
.globl bn_add_words
@@ -741,7 +746,6 @@ bn_add_words:
adcl $0, %eax
movl %ecx, 24(%ebx)
.L013aw_end:
- movl %eax, %eax
popl %edi
popl %esi
popl %ebx
@@ -750,3 +754,1448 @@ bn_add_words:
.bn_add_words_end:
SIZE(bn_add_words,.bn_add_words_end-bn_add_words)
.ident "bn_add_words"
+.text
+ .align ALIGN
+.globl bn_sub_words
+ TYPE(bn_sub_words,@function)
+bn_sub_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ movl 20(%esp), %ebx
+ movl 24(%esp), %esi
+ movl 28(%esp), %edi
+ movl 32(%esp), %ebp
+ xorl %eax, %eax
+ andl $4294967288, %ebp
+ jz .L014aw_finish
+.L015aw_loop:
+ /* Round 0 */
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, (%ebx)
+ /* Round 1 */
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 4(%ebx)
+ /* Round 2 */
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 8(%ebx)
+ /* Round 3 */
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 12(%ebx)
+ /* Round 4 */
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 16(%ebx)
+ /* Round 5 */
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 20(%ebx)
+ /* Round 6 */
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+ /* Round 7 */
+ movl 28(%esi), %ecx
+ movl 28(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 28(%ebx)
+
+ addl $32, %esi
+ addl $32, %edi
+ addl $32, %ebx
+ subl $8, %ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp), %ebp
+ andl $7, %ebp
+ jz .L016aw_end
+ /* Tail Round 0 */
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, (%ebx)
+ jz .L016aw_end
+ /* Tail Round 1 */
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 4(%ebx)
+ jz .L016aw_end
+ /* Tail Round 2 */
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 8(%ebx)
+ jz .L016aw_end
+ /* Tail Round 3 */
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 12(%ebx)
+ jz .L016aw_end
+ /* Tail Round 4 */
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 16(%ebx)
+ jz .L016aw_end
+ /* Tail Round 5 */
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 20(%ebx)
+ jz .L016aw_end
+ /* Tail Round 6 */
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_sub_words_end:
+ SIZE(bn_sub_words,.bn_sub_words_end-bn_sub_words)
+.ident "bn_sub_words"
+.text
+ .align ALIGN
+.globl bn_mul_comba8
+ TYPE(bn_mul_comba8,@function)
+bn_mul_comba8:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ /* ################## Calculate word 0 */
+ xorl %ebp, %ebp
+ /* mul a[0]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ################## Calculate word 1 */
+ xorl %ebx, %ebx
+ /* mul a[1]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ /* saved r[1] */
+ /* ################## Calculate word 2 */
+ xorl %ecx, %ecx
+ /* mul a[2]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ################## Calculate word 3 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 16(%esi), %eax
+ /* saved r[3] */
+ /* ################## Calculate word 4 */
+ xorl %ebx, %ebx
+ /* mul a[4]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 20(%esi), %eax
+ /* saved r[4] */
+ /* ################## Calculate word 5 */
+ xorl %ecx, %ecx
+ /* mul a[5]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[3]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 24(%esi), %eax
+ /* saved r[5] */
+ /* ################## Calculate word 6 */
+ xorl %ebp, %ebp
+ /* mul a[6]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[4]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[3]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[4] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ movl 28(%esi), %eax
+ /* saved r[6] */
+ /* ################## Calculate word 7 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[5]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[4]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[5] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 28(%eax)
+ movl 28(%esi), %eax
+ /* saved r[7] */
+ /* ################## Calculate word 8 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[6]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[5]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[3]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[6] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%eax)
+ movl 28(%esi), %eax
+ /* saved r[8] */
+ /* ################## Calculate word 9 */
+ xorl %ebp, %ebp
+ /* mul a[7]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[6]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[4] */
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[4]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[3]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[7] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 36(%eax)
+ movl 28(%esi), %eax
+ /* saved r[9] */
+ /* ################## Calculate word 10 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[5]*b[5] */
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[4]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%eax)
+ movl 28(%esi), %eax
+ /* saved r[10] */
+ /* ################## Calculate word 11 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[6]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[5]*b[6] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 44(%eax)
+ movl 28(%esi), %eax
+ /* saved r[11] */
+ /* ################## Calculate word 12 */
+ xorl %ebp, %ebp
+ /* mul a[7]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[6]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[7] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%eax)
+ movl 28(%esi), %eax
+ /* saved r[12] */
+ /* ################## Calculate word 13 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 52(%eax)
+ movl 28(%esi), %eax
+ /* saved r[13] */
+ /* ################## Calculate word 14 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%eax)
+ /* saved r[14] */
+ /* save r[15] */
+ movl %ebx, 60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba8_end:
+ SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_mul_comba4
+ TYPE(bn_mul_comba4,@function)
+bn_mul_comba4:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ /* ################## Calculate word 0 */
+ xorl %ebp, %ebp
+ /* mul a[0]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ################## Calculate word 1 */
+ xorl %ebx, %ebx
+ /* mul a[1]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ /* saved r[1] */
+ /* ################## Calculate word 2 */
+ xorl %ecx, %ecx
+ /* mul a[2]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ################## Calculate word 3 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 12(%esi), %eax
+ /* saved r[3] */
+ /* ################## Calculate word 4 */
+ xorl %ebx, %ebx
+ /* mul a[3]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 12(%esi), %eax
+ /* saved r[4] */
+ /* ################## Calculate word 5 */
+ xorl %ecx, %ecx
+ /* mul a[3]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 12(%esi), %eax
+ /* saved r[5] */
+ /* ################## Calculate word 6 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ /* saved r[6] */
+ /* save r[7] */
+ movl %ecx, 28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba4_end:
+ SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_sqr_comba8
+ TYPE(bn_sqr_comba8,@function)
+bn_sqr_comba8:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ /* ############### Calculate word 0 */
+ xorl %ebp, %ebp
+ /* sqr a[0]*a[0] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ############### Calculate word 1 */
+ xorl %ebx, %ebx
+ /* sqr a[1]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ /* saved r[1] */
+ /* ############### Calculate word 2 */
+ xorl %ecx, %ecx
+ /* sqr a[2]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[1]*a[1] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ############### Calculate word 3 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[2]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl (%esi), %edx
+ /* saved r[3] */
+ /* ############### Calculate word 4 */
+ xorl %ebx, %ebx
+ /* sqr a[4]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 12(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ /* sqr a[3]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[2]*a[2] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl (%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 20(%esi), %eax
+ /* saved r[4] */
+ /* ############### Calculate word 5 */
+ xorl %ecx, %ecx
+ /* sqr a[5]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ movl 4(%esi), %edx
+ /* sqr a[4]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ /* sqr a[3]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ movl (%esi), %edx
+ /* saved r[5] */
+ /* ############### Calculate word 6 */
+ xorl %ebp, %ebp
+ /* sqr a[6]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[5]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl 8(%esi), %edx
+ /* sqr a[4]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ /* sqr a[3]*a[3] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ movl 28(%esi), %eax
+ /* saved r[6] */
+ /* ############### Calculate word 7 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ /* sqr a[6]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ movl 8(%esi), %edx
+ /* sqr a[5]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %eax
+ adcl $0, %ebx
+ movl 12(%esi), %edx
+ /* sqr a[4]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 28(%edi)
+ movl 4(%esi), %edx
+ /* saved r[7] */
+ /* ############### Calculate word 8 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ /* sqr a[6]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 20(%esi), %eax
+ adcl $0, %ecx
+ movl 12(%esi), %edx
+ /* sqr a[5]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[4]*a[4] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 8(%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%edi)
+ movl 28(%esi), %eax
+ /* saved r[8] */
+ /* ############### Calculate word 9 */
+ xorl %ebp, %ebp
+ /* sqr a[7]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ movl 12(%esi), %edx
+ /* sqr a[6]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 16(%esi), %edx
+ /* sqr a[5]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 28(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 36(%edi)
+ movl 12(%esi), %edx
+ /* saved r[9] */
+ /* ############### Calculate word 10 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 16(%esi), %edx
+ /* sqr a[6]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[5]*a[5] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%edi)
+ movl 28(%esi), %eax
+ /* saved r[10] */
+ /* ############### Calculate word 11 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 20(%esi), %edx
+ /* sqr a[6]*a[5] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 28(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 44(%edi)
+ movl 20(%esi), %edx
+ /* saved r[11] */
+ /* ############### Calculate word 12 */
+ xorl %ebp, %ebp
+ /* sqr a[7]*a[5] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ /* sqr a[6]*a[6] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%edi)
+ movl 28(%esi), %eax
+ /* saved r[12] */
+ /* ############### Calculate word 13 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[6] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 52(%edi)
+ /* saved r[13] */
+ /* ############### Calculate word 14 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[7] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%edi)
+ /* saved r[14] */
+ movl %ebx, 60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba8_end:
+ SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_sqr_comba4
+ TYPE(bn_sqr_comba4,@function)
+bn_sqr_comba4:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ /* ############### Calculate word 0 */
+ xorl %ebp, %ebp
+ /* sqr a[0]*a[0] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ############### Calculate word 1 */
+ xorl %ebx, %ebx
+ /* sqr a[1]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ /* saved r[1] */
+ /* ############### Calculate word 2 */
+ xorl %ecx, %ecx
+ /* sqr a[2]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[1]*a[1] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ############### Calculate word 3 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[2]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl 4(%esi), %edx
+ /* saved r[3] */
+ /* ############### Calculate word 4 */
+ xorl %ebx, %ebx
+ /* sqr a[3]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[2]*a[2] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 12(%esi), %eax
+ /* saved r[4] */
+ /* ############### Calculate word 5 */
+ xorl %ecx, %ecx
+ /* sqr a[3]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ /* saved r[5] */
+ /* ############### Calculate word 6 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[3] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ /* saved r[6] */
+ movl %ecx, 28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba4_end:
+ SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4)
+.ident "desasm.pl"
diff --git a/crypto/bn/asm/ca.pl b/crypto/bn/asm/ca.pl
new file mode 100644
index 0000000000..181d1f007e
--- /dev/null
+++ b/crypto/bn/asm/ca.pl
@@ -0,0 +1,33 @@
+#!/usr/local/bin/perl
+# I have this in Perl so I can use more useful register names and then convert
+# them into Alpha registers.
+#
+
+push(@INC,"perlasm","../../perlasm");
+require "alpha.pl";
+require "alpha/mul_add.pl";
+require "alpha/mul.pl";
+require "alpha/sqr.pl";
+require "alpha/add.pl";
+require "alpha/sub.pl";
+require "alpha/mul_c8.pl";
+require "alpha/mul_c4.pl";
+require "alpha/sqr_c4.pl";
+require "alpha/sqr_c8.pl";
+require "alpha/div.pl";
+
+&asm_init($ARGV[0],"bn-586.pl");
+
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_mul_add_words("bn_mul_add_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_div_words("bn_div_words");
+&bn_mul_comba8("bn_mul_comba8");
+&bn_mul_comba4("bn_mul_comba4");
+&bn_sqr_comba4("bn_sqr_comba4");
+&bn_sqr_comba8("bn_sqr_comba8");
+
+&asm_finish();
+
diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl
new file mode 100644
index 0000000000..0bcb5a6d47
--- /dev/null
+++ b/crypto/bn/asm/co-586.pl
@@ -0,0 +1,286 @@
+#!/usr/local/bin/perl
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"bn-586.pl");
+
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
+
+sub mul_add_c
+ {
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("mul a[$ai]*b[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ &mul("edx");
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
+ ###
+ &adc($c2,0);
+	# if pos > 1, this is the last loop
+ &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
+ }
+
+sub sqr_add_c
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+ ###
+ &adc($c2,0);
+	# if pos > 1, this is the last loop
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ }
+
+sub sqr_add_c2
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$a,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add("eax","eax");
+ ###
+ &adc("edx","edx");
+ ###
+ &adc($c2,0);
+ &add($c0,"eax");
+ &adc($c1,"edx");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ &adc($c2,0);
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+ ###
+ }
+
+sub bn_mul_comba
+ {
+ local($name,$num)=@_;
+ local($a,$b,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($tot,$end);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $b="edi";
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ &push("esi");
+ &mov($a,&wparam(1));
+ &push("edi");
+ &mov($b,&wparam(2));
+ &push("ebp");
+ &push("ebx");
+
+ &xor($c0,$c0);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+ &xor($c1,$c1);
+ &mov("edx",&DWP(0,$b,"",0)); # load the first second
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("################## Calculate word $i");
+
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($j+1) == $end)
+ {
+ $v=1;
+ $v=2 if (($i+1) == $tot);
+ }
+ else
+ { $v=0; }
+ if (($j+1) != $end)
+ {
+ $na=($ai-1);
+ $nb=($bi+1);
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ # &mov("eax",&wparam(0));
+ # &mov(&DWP($i*4,"eax","",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &comment("save r[$i]");
+ # &mov("eax",&wparam(0));
+ &mov(&DWP($i*4,"eax","",0),$c0);
+
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
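The $as/$ae/$bs/$be bookkeeping above simply walks the anti-diagonal ai + bi == i for each output word, which is all the comba schedule is. A throwaway C program (purely illustrative) that prints the same pair order the generator emits for num == 4:

    #include <stdio.h>

    int main(void)
    {
        int num = 4, i, ai, bi;

        for (i = 0; i < num + num - 1; i++) {   /* one column per word */
            printf("word %d:", i);
            for (bi = 0; bi < num; bi++) {
                ai = i - bi;
                if (ai >= 0 && ai < num)
                    printf(" a[%d]*b[%d]", ai, bi);
            }
            printf("\n");
        }
        return 0;
    }

Word 3, for example, comes out as a[3]*b[0] a[2]*b[1] a[1]*b[2] a[0]*b[3], matching the generated comments.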
+sub bn_sqr_comba
+ {
+ local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($b,$tot,$end,$half);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $r="edi";
+
+ &push("esi");
+ &push("edi");
+ &push("ebp");
+ &push("ebx");
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &xor($c0,$c0);
+ &xor($c1,$c1);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("############### Calculate word $i");
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($ai-1) < ($bi+1))
+ {
+ $v=1;
+ $v=2 if ($i+1) == $tot;
+ }
+ else
+ { $v=0; }
+ if (!$v)
+ {
+ $na=$ai-1;
+ $nb=$bi+1;
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ if ($ai == $bi)
+ {
+ &sqr_add_c($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ else
+ {
+ &sqr_add_c2($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ #&mov(&DWP($i*4,$r,"",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ last;
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &mov(&DWP($i*4,$r,"",0),$c0);
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
diff --git a/crypto/bn/asm/co-alpha.pl b/crypto/bn/asm/co-alpha.pl
new file mode 100644
index 0000000000..23869a4ef5
--- /dev/null
+++ b/crypto/bn/asm/co-alpha.pl
@@ -0,0 +1,116 @@
+#!/usr/local/bin/perl
+# I have this in Perl so I can use more useful register names and then convert
+# them into Alpha registers.
+#
+
+push(@INC,"perlasm","../../perlasm");
+require "alpha.pl";
+
+&asm_init($ARGV[0],"bn-586.pl");
+
+print &bn_sub_words("bn_sub_words");
+
+&asm_finish();
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+ local($cc,$a,$b,$r);
+
+ $cc="r0";
+ $a0="r1"; $b0="r5"; $r0="r9"; $tmp="r13";
+ $a1="r2"; $b1="r6"; $r1="r10"; $t1="r14";
+ $a2="r3"; $b2="r7"; $r2="r11";
+ $a3="r4"; $b3="r8"; $r3="r12"; $t3="r15";
+
+ $rp=&wparam(0);
+ $ap=&wparam(1);
+ $bp=&wparam(2);
+ $count=&wparam(3);
+
+ &function_begin($name,"");
+
+ &comment("");
+ &sub($count,4,$count);
+ &mov("zero",$cc);
+ &blt($count,&label("finish"));
+
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+
+##########################################################
+ &set_label("loop");
+
+ &ld($a1,&QWPw(1,$ap));
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &ld($b1,&QWPw(1,$bp));
+ &sub($a0,$b0,$a0); # do the subtract
+ &ld($a2,&QWPw(2,$ap));
+ &cmpult($a0,$cc,$b0); # will we borrow?
+ &ld($b2,&QWPw(2,$bp));
+ &sub($a0,$cc,$a0); # will we borrow?
+ &ld($a3,&QWPw(3,$ap));
+ &add($b0,$tmp,$cc); # add the borrows
+
+ &cmpult($a1,$b1,$t1); # will we borrow?
+ &sub($a1,$b1,$a1); # do the subtract
+ &ld($b3,&QWPw(3,$bp));
+ &cmpult($a1,$cc,$b1); # will we borrow?
+ &sub($a1,$cc,$a1); # will we borrow?
+ &add($b1,$t1,$cc); # add the borrows
+
+ &cmpult($a2,$b2,$tmp); # will we borrow?
+ &sub($a2,$b2,$a2); # do the subtract
+ &st($a0,&QWPw(0,$rp)); # save
+ &cmpult($a2,$cc,$b2); # will we borrow?
+ &sub($a2,$cc,$a2); # will we borrow?
+ &add($b2,$tmp,$cc); # add the borrows
+
+ &cmpult($a3,$b3,$t3); # will we borrow?
+ &sub($a3,$b3,$a3); # do the subtract
+ &st($a1,&QWPw(1,$rp)); # save
+ &cmpult($a3,$cc,$b3); # will we borrow?
+ &sub($a3,$cc,$a3); # will we borrow?
+ &add($b3,$t3,$cc); # add the borrows
+
+ &st($a2,&QWPw(2,$rp)); # save
+ &sub($count,4,$count); # count-=4
+ &st($a3,&QWPw(3,$rp)); # save
+ &add($ap,4*$QWS,$ap); # count+=4
+ &add($bp,4*$QWS,$bp); # count+=4
+ &add($rp,4*$QWS,$rp); # count+=4
+
+ &blt($count,&label("finish"));
+ &ld($a0,&QWPw(0,$ap));
+ &ld($b0,&QWPw(0,$bp));
+ &br(&label("loop"));
+##################################################
+ # Do the last 0..3 words
+
+ &set_label("last_loop");
+
+ &ld($a0,&QWPw(0,$ap)); # get a
+ &ld($b0,&QWPw(0,$bp)); # get b
+ &cmpult($a0,$b0,$tmp); # will we borrow?
+ &sub($a0,$b0,$a0); # do the subtract
+ &cmpult($a0,$cc,$b0); # will we borrow?
+ &sub($a0,$cc,$a0); # will we borrow?
+ &st($a0,&QWPw(0,$rp)); # save
+ &add($b0,$tmp,$cc); # add the borrows
+
+ &add($ap,$QWS,$ap);
+ &add($bp,$QWS,$bp);
+ &add($rp,$QWS,$rp);
+ &sub($count,1,$count);
+ &bgt($count,&label("last_loop"));
+ &function_end_A($name);
+
+######################################################
+ &set_label("finish");
+ &add($count,4,$count);
+ &bgt($count,&label("last_loop"));
+
+ &set_label("end");
+ &function_end($name);
+ }
+
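The Alpha has no carry flag, so the borrow has to be recomputed with cmpult around every subtract. One link of that chain, as a C sketch over an assumed 64-bit word type (the helper name is illustrative):

    #include <stdint.h>

    /* Mirrors the cmpult/sub/cmpult/sub/add sequence above; *cc is the
     * running borrow (0 or 1). */
    static uint64_t sub_word_sketch(uint64_t a, uint64_t b, uint64_t *cc)
    {
        uint64_t t1 = (a < b);      /* cmpult a, b  : will we borrow? */
        uint64_t r  = a - b;        /* sub          : do the subtract */
        uint64_t t2 = (r < *cc);    /* cmpult r, cc : borrow again? */

        r  -= *cc;                  /* subtract the incoming borrow */
        *cc = t1 + t2;              /* add the borrows; never both set */
        return r;
    }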
diff --git a/crypto/bn/asm/co86unix.cpp b/crypto/bn/asm/co86unix.cpp
new file mode 100644
index 0000000000..fa80b14046
--- /dev/null
+++ b/crypto/bn/asm/co86unix.cpp
@@ -0,0 +1,1315 @@
+/* Run the C pre-processor over this file with one of the following defined
+ * ELF - elf object files,
+ * OUT - a.out object files,
+ * BSDI - BSDI style a.out object files
+ * SOL - Solaris style elf
+ */
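For illustration only (the actual make rule may differ), one plausible way to use the file is to run it through the preprocessor and pipe the result into the assembler, e.g. cc -E -DELF co86unix.cpp | as -o co86unix.o.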
+
+#define TYPE(a,b) .type a,b
+#define SIZE(a,b) .size a,b
+
+#if defined(OUT) || defined(BSDI)
+#define bn_mul_comba8 _bn_mul_comba8
+#define bn_mul_comba4 _bn_mul_comba4
+#define bn_sqr_comba8 _bn_sqr_comba8
+#define bn_sqr_comba4 _bn_sqr_comba4
+
+#endif
+
+#ifdef OUT
+#define OK 1
+#define ALIGN 4
+#endif
+
+#ifdef BSDI
+#define OK 1
+#define ALIGN 4
+#undef SIZE
+#undef TYPE
+#define SIZE(a,b)
+#define TYPE(a,b)
+#endif
+
+#if defined(ELF) || defined(SOL)
+#define OK 1
+#define ALIGN 16
+#endif
+
+#ifndef OK
+You need to define one of
+ELF - elf systems - linux-elf, NetBSD and DG-UX
+OUT - a.out systems - linux-a.out and FreeBSD
+SOL - solaris systems, which are elf with strange comment lines
+BSDI - a.out with a very primitive version of as.
+#endif
+
+/* Let the Assembler begin :-) */
+ /* Don't even think of reading this code */
+ /* It was automatically generated by bn-586.pl */
+	/* Which is a perl program used to generate the x86 assembler for */
+	/* any of elf, a.out, BSDI, Win32, or Solaris */
+ /* eric <eay@cryptsoft.com> */
+
+ .file "bn-586.s"
+ .version "01.01"
+gcc2_compiled.:
+.text
+ .align ALIGN
+.globl bn_mul_comba8
+ TYPE(bn_mul_comba8,@function)
+bn_mul_comba8:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ /* ################## Calculate word 0 */
+ xorl %ebp, %ebp
+ /* mul a[0]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ################## Calculate word 1 */
+ xorl %ebx, %ebx
+ /* mul a[1]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ /* saved r[1] */
+ /* ################## Calculate word 2 */
+ xorl %ecx, %ecx
+ /* mul a[2]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ################## Calculate word 3 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 16(%esi), %eax
+ /* saved r[3] */
+ /* ################## Calculate word 4 */
+ xorl %ebx, %ebx
+ /* mul a[4]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 20(%esi), %eax
+ /* saved r[4] */
+ /* ################## Calculate word 5 */
+ xorl %ecx, %ecx
+ /* mul a[5]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[3]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 24(%esi), %eax
+ /* saved r[5] */
+ /* ################## Calculate word 6 */
+ xorl %ebp, %ebp
+ /* mul a[6]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[4]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[3]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[4] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ movl 28(%esi), %eax
+ /* saved r[6] */
+ /* ################## Calculate word 7 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[5]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[4]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[5] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 28(%eax)
+ movl 28(%esi), %eax
+ /* saved r[7] */
+ /* ################## Calculate word 8 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[6]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[5]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[3]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[6] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%eax)
+ movl 28(%esi), %eax
+ /* saved r[8] */
+ /* ################## Calculate word 9 */
+ xorl %ebp, %ebp
+ /* mul a[7]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[6]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[4] */
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[4]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[3]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[7] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 36(%eax)
+ movl 28(%esi), %eax
+ /* saved r[9] */
+ /* ################## Calculate word 10 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[4] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[5]*b[5] */
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[4]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[3]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%eax)
+ movl 28(%esi), %eax
+ /* saved r[10] */
+ /* ################## Calculate word 11 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[4] */
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[6]*b[5] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[5]*b[6] */
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[4]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 44(%eax)
+ movl 28(%esi), %eax
+ /* saved r[11] */
+ /* ################## Calculate word 12 */
+ xorl %ebp, %ebp
+ /* mul a[7]*b[5] */
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[6]*b[6] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[5]*b[7] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%eax)
+ movl 28(%esi), %eax
+ /* saved r[12] */
+ /* ################## Calculate word 13 */
+ xorl %ebx, %ebx
+ /* mul a[7]*b[6] */
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[6]*b[7] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 52(%eax)
+ movl 28(%esi), %eax
+ /* saved r[13] */
+ /* ################## Calculate word 14 */
+ xorl %ecx, %ecx
+ /* mul a[7]*b[7] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%eax)
+ /* saved r[14] */
+ /* save r[15] */
+ movl %ebx, 60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba8_end:
+ SIZE(bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_mul_comba4
+ TYPE(bn_mul_comba4,@function)
+bn_mul_comba4:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ /* ################## Calculate word 0 */
+ xorl %ebp, %ebp
+ /* mul a[0]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ################## Calculate word 1 */
+ xorl %ebx, %ebx
+ /* mul a[1]*b[0] */
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[0]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ /* saved r[1] */
+ /* ################## Calculate word 2 */
+ xorl %ecx, %ecx
+ /* mul a[2]*b[0] */
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[1]*b[1] */
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[0]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ################## Calculate word 3 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[0] */
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[2]*b[1] */
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[1]*b[2] */
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ /* mul a[0]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 12(%esi), %eax
+ /* saved r[3] */
+ /* ################## Calculate word 4 */
+ xorl %ebx, %ebx
+ /* mul a[3]*b[1] */
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[2]*b[2] */
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ /* mul a[1]*b[3] */
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 12(%esi), %eax
+ /* saved r[4] */
+ /* ################## Calculate word 5 */
+ xorl %ecx, %ecx
+ /* mul a[3]*b[2] */
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ /* mul a[2]*b[3] */
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 12(%esi), %eax
+ /* saved r[5] */
+ /* ################## Calculate word 6 */
+ xorl %ebp, %ebp
+ /* mul a[3]*b[3] */
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ /* saved r[6] */
+ /* save r[7] */
+ movl %ecx, 28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba4_end:
+ SIZE(bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_sqr_comba8
+ TYPE(bn_sqr_comba8,@function)
+bn_sqr_comba8:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ /* ############### Calculate word 0 */
+ xorl %ebp, %ebp
+ /* sqr a[0]*a[0] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ############### Calculate word 1 */
+ xorl %ebx, %ebx
+ /* sqr a[1]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ /* saved r[1] */
+ /* ############### Calculate word 2 */
+ xorl %ecx, %ecx
+ /* sqr a[2]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[1]*a[1] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ############### Calculate word 3 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[2]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl (%esi), %edx
+ /* saved r[3] */
+ /* ############### Calculate word 4 */
+ xorl %ebx, %ebx
+ /* sqr a[4]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 12(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ /* sqr a[3]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[2]*a[2] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl (%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 20(%esi), %eax
+ /* saved r[4] */
+ /* ############### Calculate word 5 */
+ xorl %ecx, %ecx
+ /* sqr a[5]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ movl 4(%esi), %edx
+ /* sqr a[4]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ /* sqr a[3]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ movl (%esi), %edx
+ /* saved r[5] */
+ /* ############### Calculate word 6 */
+ xorl %ebp, %ebp
+ /* sqr a[6]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[5]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl 8(%esi), %edx
+ /* sqr a[4]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ /* sqr a[3]*a[3] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ movl 28(%esi), %eax
+ /* saved r[6] */
+ /* ############### Calculate word 7 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ /* sqr a[6]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ movl 8(%esi), %edx
+ /* sqr a[5]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %eax
+ adcl $0, %ebx
+ movl 12(%esi), %edx
+ /* sqr a[4]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 28(%edi)
+ movl 4(%esi), %edx
+ /* saved r[7] */
+ /* ############### Calculate word 8 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ /* sqr a[6]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 20(%esi), %eax
+ adcl $0, %ecx
+ movl 12(%esi), %edx
+ /* sqr a[5]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[4]*a[4] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 8(%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%edi)
+ movl 28(%esi), %eax
+ /* saved r[8] */
+ /* ############### Calculate word 9 */
+ xorl %ebp, %ebp
+ /* sqr a[7]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ movl 12(%esi), %edx
+ /* sqr a[6]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 16(%esi), %edx
+ /* sqr a[5]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 28(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 36(%edi)
+ movl 12(%esi), %edx
+ /* saved r[9] */
+ /* ############### Calculate word 10 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[3] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 16(%esi), %edx
+ /* sqr a[6]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[5]*a[5] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%edi)
+ movl 28(%esi), %eax
+ /* saved r[10] */
+ /* ############### Calculate word 11 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[4] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 20(%esi), %edx
+ /* sqr a[6]*a[5] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 28(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 44(%edi)
+ movl 20(%esi), %edx
+ /* saved r[11] */
+ /* ############### Calculate word 12 */
+ xorl %ebp, %ebp
+ /* sqr a[7]*a[5] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ /* sqr a[6]*a[6] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%edi)
+ movl 28(%esi), %eax
+ /* saved r[12] */
+ /* ############### Calculate word 13 */
+ xorl %ebx, %ebx
+ /* sqr a[7]*a[6] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 52(%edi)
+ /* saved r[13] */
+ /* ############### Calculate word 14 */
+ xorl %ecx, %ecx
+ /* sqr a[7]*a[7] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%edi)
+ /* saved r[14] */
+ movl %ebx, 60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba8_end:
+ SIZE(bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8)
+.ident "desasm.pl"
+.text
+ .align ALIGN
+.globl bn_sqr_comba4
+ TYPE(bn_sqr_comba4,@function)
+bn_sqr_comba4:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ /* ############### Calculate word 0 */
+ xorl %ebp, %ebp
+ /* sqr a[0]*a[0] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ /* saved r[0] */
+ /* ############### Calculate word 1 */
+ xorl %ebx, %ebx
+ /* sqr a[1]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ /* saved r[1] */
+ /* ############### Calculate word 2 */
+ xorl %ecx, %ecx
+ /* sqr a[2]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ /* sqr a[1]*a[1] */
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ /* saved r[2] */
+ /* ############### Calculate word 3 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[0] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ /* sqr a[2]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl 4(%esi), %edx
+ /* saved r[3] */
+ /* ############### Calculate word 4 */
+ xorl %ebx, %ebx
+ /* sqr a[3]*a[1] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ /* sqr a[2]*a[2] */
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 12(%esi), %eax
+ /* saved r[4] */
+ /* ############### Calculate word 5 */
+ xorl %ecx, %ecx
+ /* sqr a[3]*a[2] */
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ /* saved r[5] */
+ /* ############### Calculate word 6 */
+ xorl %ebp, %ebp
+ /* sqr a[3]*a[3] */
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ /* saved r[6] */
+ movl %ecx, 28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba4_end:
+ SIZE(bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4)
+.ident "desasm.pl"
diff --git a/crypto/bn/asm/elf.s b/crypto/bn/asm/elf.s
new file mode 100644
index 0000000000..97ad1264db
--- /dev/null
+++ b/crypto/bn/asm/elf.s
@@ -0,0 +1,1269 @@
+ # Don't even think of reading this code
+ # It was automatically generated by bn-586.pl
+	# Which is a perl program used to generate the x86 assembler for
+	# any of elf, a.out, BSDI, Win32, or Solaris
+ # eric <eay@cryptsoft.com>
+
+ .file "bn-586.s"
+ .version "01.01"
+gcc2_compiled.:
+.text
+ .align 16
+.globl bn_mul_comba8
+ .type bn_mul_comba8,@function
+bn_mul_comba8:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ # ################## Calculate word 0
+ xorl %ebp, %ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx, %ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx, %ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp, %ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 16(%esi), %eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx, %ebx
+ # mul a[4]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 20(%esi), %eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx, %ecx
+ # mul a[5]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 24(%esi), %eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp, %ebp
+ # mul a[6]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[4]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[4]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ movl 28(%esi), %eax
+ # saved r[6]
+ # ################## Calculate word 7
+ xorl %ebx, %ebx
+ # mul a[7]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[5]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[4]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[5]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 28(%eax)
+ movl 28(%esi), %eax
+ # saved r[7]
+ # ################## Calculate word 8
+ xorl %ecx, %ecx
+ # mul a[7]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[6]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[5]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[3]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[6]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%eax)
+ movl 28(%esi), %eax
+ # saved r[8]
+ # ################## Calculate word 9
+ xorl %ebp, %ebp
+ # mul a[7]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[6]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[4]
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ # mul a[4]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[3]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[7]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 36(%eax)
+ movl 28(%esi), %eax
+ # saved r[9]
+ # ################## Calculate word 10
+ xorl %ebx, %ebx
+ # mul a[7]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ # mul a[5]*b[5]
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ # mul a[4]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%eax)
+ movl 28(%esi), %eax
+ # saved r[10]
+ # ################## Calculate word 11
+ xorl %ecx, %ecx
+ # mul a[7]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[6]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ # mul a[5]*b[6]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 44(%eax)
+ movl 28(%esi), %eax
+ # saved r[11]
+ # ################## Calculate word 12
+ xorl %ebp, %ebp
+ # mul a[7]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[6]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[7]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%eax)
+ movl 28(%esi), %eax
+ # saved r[12]
+ # ################## Calculate word 13
+ xorl %ebx, %ebx
+ # mul a[7]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 52(%eax)
+ movl 28(%esi), %eax
+ # saved r[13]
+ # ################## Calculate word 14
+ xorl %ecx, %ecx
+ # mul a[7]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%eax)
+ # saved r[14]
+ # save r[15]
+ movl %ebx, 60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba8_end:
+ .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_mul_comba4
+ .type bn_mul_comba4,@function
+bn_mul_comba4:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ # ################## Calculate word 0
+ xorl %ebp, %ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx, %ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx, %ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp, %ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 12(%esi), %eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx, %ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 12(%esi), %eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx, %ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 12(%esi), %eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp, %ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ # saved r[6]
+ # save r[7]
+ movl %ecx, 28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba4_end:
+ .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_sqr_comba8
+ .type bn_sqr_comba8,@function
+bn_sqr_comba8:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ # ############### Calculate word 0
+ xorl %ebp, %ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx, %ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx, %ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp, %ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl (%esi), %edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx, %ebx
+ # sqr a[4]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 12(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl (%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 20(%esi), %eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx, %ecx
+ # sqr a[5]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ movl 4(%esi), %edx
+ # sqr a[4]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ movl (%esi), %edx
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp, %ebp
+ # sqr a[6]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[5]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl 8(%esi), %edx
+ # sqr a[4]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ movl 28(%esi), %eax
+ # saved r[6]
+ # ############### Calculate word 7
+ xorl %ebx, %ebx
+ # sqr a[7]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ # sqr a[6]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ movl 8(%esi), %edx
+ # sqr a[5]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %eax
+ adcl $0, %ebx
+ movl 12(%esi), %edx
+ # sqr a[4]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 28(%edi)
+ movl 4(%esi), %edx
+ # saved r[7]
+ # ############### Calculate word 8
+ xorl %ecx, %ecx
+ # sqr a[7]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ # sqr a[6]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 20(%esi), %eax
+ adcl $0, %ecx
+ movl 12(%esi), %edx
+ # sqr a[5]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[4]*a[4]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 8(%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%edi)
+ movl 28(%esi), %eax
+ # saved r[8]
+ # ############### Calculate word 9
+ xorl %ebp, %ebp
+ # sqr a[7]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ movl 12(%esi), %edx
+ # sqr a[6]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 16(%esi), %edx
+ # sqr a[5]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 28(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 36(%edi)
+ movl 12(%esi), %edx
+ # saved r[9]
+ # ############### Calculate word 10
+ xorl %ebx, %ebx
+ # sqr a[7]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 16(%esi), %edx
+ # sqr a[6]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[5]*a[5]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%edi)
+ movl 28(%esi), %eax
+ # saved r[10]
+ # ############### Calculate word 11
+ xorl %ecx, %ecx
+ # sqr a[7]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 20(%esi), %edx
+ # sqr a[6]*a[5]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 28(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 44(%edi)
+ movl 20(%esi), %edx
+ # saved r[11]
+ # ############### Calculate word 12
+ xorl %ebp, %ebp
+ # sqr a[7]*a[5]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ # sqr a[6]*a[6]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%edi)
+ movl 28(%esi), %eax
+ # saved r[12]
+ # ############### Calculate word 13
+ xorl %ebx, %ebx
+ # sqr a[7]*a[6]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 52(%edi)
+ # saved r[13]
+ # ############### Calculate word 14
+ xorl %ecx, %ecx
+ # sqr a[7]*a[7]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%edi)
+ # saved r[14]
+ movl %ebx, 60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba8_end:
+ .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_sqr_comba4
+ .type bn_sqr_comba4,@function
+bn_sqr_comba4:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ # ############### Calculate word 0
+ xorl %ebp, %ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx, %ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx, %ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp, %ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl 4(%esi), %edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx, %ebx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 12(%esi), %eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx, %ecx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp, %ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ # saved r[6]
+ movl %ecx, 28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba4_end:
+ .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4
+.ident "desasm.pl"
diff --git a/crypto/bn/asm/f b/crypto/bn/asm/f
new file mode 100644
index 0000000000..a23fa159b2
--- /dev/null
+++ b/crypto/bn/asm/f
@@ -0,0 +1,500 @@
+ .text
+ .align 3
+ .globl bn_sqr_comba8
+ .ent bn_sqr_comba8
+bn_sqr_comba8:
+bn_sqr_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 8($17)
+ ldq $2, 16($17)
+ ldq $3, 24($17)
+ ldq $4, 32($17)
+ ldq $5, 40($17)
+ ldq $6, 48($17)
+ ldq $7, 56($17)
+ bis $31, $31, $23
+ mulq $0, $0, $8
+ umulh $0, $0, $22
+ stq $8, 0($16)
+ bis $31, $31, $8
+ mulq $1, $0, $24
+ umulh $1, $0, $25
+ cmplt $24, $31, $27
+ cmplt $25, $31, $28
+ addq $24, $24, $24
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $8, $28, $8
+ addq $22, $24, $22
+ addq $23, $25, $23
+ cmpult $22, $24, $21
+ cmpult $23, $25, $20
+ addq $23, $21, $23
+ addq $8, $20, $8
+ stq $22, 8($16)
+ bis $31, $31, $22
+ mulq $1, $1, $19
+ umulh $1, $1, $18
+ addq $23, $19, $23
+ addq $8, $18, $8
+ cmpult $23, $19, $17
+ cmpult $8, $18, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ mulq $2, $0, $28
+ umulh $2, $0, $24
+ cmplt $28, $31, $25
+ cmplt $24, $31, $21
+ addq $28, $28, $28
+ addq $24, $24, $24
+ addq $24, $25, $24
+ addq $22, $21, $22
+ addq $23, $28, $23
+ addq $8, $24, $8
+ cmpult $23, $28, $20
+ cmpult $8, $24, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 16($16)
+ bis $31, $31, $23
+ mulq $2, $1, $18
+ umulh $2, $1, $17
+ cmplt $18, $31, $27
+ cmplt $17, $31, $25
+ addq $18, $18, $18
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $23, $25, $23
+ addq $8, $18, $8
+ addq $22, $17, $22
+ cmpult $8, $18, $21
+ cmpult $22, $17, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $3, $0, $24
+ umulh $3, $0, $20
+ cmplt $24, $31, $19
+ cmplt $20, $31, $27
+ addq $24, $24, $24
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $23, $27, $23
+ addq $8, $24, $8
+ addq $22, $20, $22
+ cmpult $8, $24, $25
+ cmpult $22, $20, $18
+ addq $22, $25, $22
+ addq $23, $18, $23
+ stq $8, 24($16)
+ bis $31, $31, $8
+ mulq $2, $2, $17
+ umulh $2, $2, $21
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $28
+ cmpult $23, $21, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ mulq $3, $1, $27
+ umulh $3, $1, $24
+ cmplt $27, $31, $20
+ cmplt $24, $31, $25
+ addq $27, $27, $27
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $8, $25, $8
+ addq $22, $27, $22
+ addq $23, $24, $23
+ cmpult $22, $27, $18
+ cmpult $23, $24, $17
+ addq $23, $18, $23
+ addq $8, $17, $8
+ mulq $4, $0, $21
+ umulh $4, $0, $28
+ cmplt $21, $31, $19
+ cmplt $28, $31, $20
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $19, $28
+ addq $8, $20, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $25
+ cmpult $23, $28, $27
+ addq $23, $25, $23
+ addq $8, $27, $8
+ stq $22, 32($16)
+ bis $31, $31, $22
+ mulq $3, $2, $24
+ umulh $3, $2, $18
+ cmplt $24, $31, $17
+ cmplt $18, $31, $19
+ addq $24, $24, $24
+ addq $18, $18, $18
+ addq $18, $17, $18
+ addq $22, $19, $22
+ addq $23, $24, $23
+ addq $8, $18, $8
+ cmpult $23, $24, $20
+ cmpult $8, $18, $21
+ addq $8, $20, $8
+ addq $22, $21, $22
+ mulq $4, $1, $28
+ umulh $4, $1, $25
+ cmplt $28, $31, $27
+ cmplt $25, $31, $17
+ addq $28, $28, $28
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $22, $17, $22
+ addq $23, $28, $23
+ addq $8, $25, $8
+ cmpult $23, $28, $19
+ cmpult $8, $25, $24
+ addq $8, $19, $8
+ addq $22, $24, $22
+ mulq $5, $0, $18
+ umulh $5, $0, $20
+ cmplt $18, $31, $21
+ cmplt $20, $31, $27
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $21, $20
+ addq $22, $27, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $28
+ addq $8, $17, $8
+ addq $22, $28, $22
+ stq $23, 40($16)
+ bis $31, $31, $23
+ mulq $3, $3, $25
+ umulh $3, $3, $19
+ addq $8, $25, $8
+ addq $22, $19, $22
+ cmpult $8, $25, $24
+ cmpult $22, $19, $21
+ addq $22, $24, $22
+ addq $23, $21, $23
+ mulq $4, $2, $27
+ umulh $4, $2, $18
+ cmplt $27, $31, $20
+ cmplt $18, $31, $17
+ addq $27, $27, $27
+ addq $18, $18, $18
+ addq $18, $20, $18
+ addq $23, $17, $23
+ addq $8, $27, $8
+ addq $22, $18, $22
+ cmpult $8, $27, $28
+ cmpult $22, $18, $25
+ addq $22, $28, $22
+ addq $23, $25, $23
+ mulq $5, $1, $19
+ umulh $5, $1, $24
+ cmplt $19, $31, $21
+ cmplt $24, $31, $20
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $21, $24
+ addq $23, $20, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $17
+ cmpult $22, $24, $27
+ addq $22, $17, $22
+ addq $23, $27, $23
+ mulq $6, $0, $18
+ umulh $6, $0, $28
+ cmplt $18, $31, $25
+ cmplt $28, $31, $21
+ addq $18, $18, $18
+ addq $28, $28, $28
+ addq $28, $25, $28
+ addq $23, $21, $23
+ addq $8, $18, $8
+ addq $22, $28, $22
+ cmpult $8, $18, $20
+ cmpult $22, $28, $19
+ addq $22, $20, $22
+ addq $23, $19, $23
+ stq $8, 48($16)
+ bis $31, $31, $8
+ mulq $4, $3, $24
+ umulh $4, $3, $17
+ cmplt $24, $31, $27
+ cmplt $17, $31, $25
+ addq $24, $24, $24
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $8, $25, $8
+ addq $22, $24, $22
+ addq $23, $17, $23
+ cmpult $22, $24, $21
+ cmpult $23, $17, $18
+ addq $23, $21, $23
+ addq $8, $18, $8
+ mulq $5, $2, $28
+ umulh $5, $2, $20
+ cmplt $28, $31, $19
+ cmplt $20, $31, $27
+ addq $28, $28, $28
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $8, $27, $8
+ addq $22, $28, $22
+ addq $23, $20, $23
+ cmpult $22, $28, $25
+ cmpult $23, $20, $24
+ addq $23, $25, $23
+ addq $8, $24, $8
+ mulq $6, $1, $17
+ umulh $6, $1, $21
+ cmplt $17, $31, $18
+ cmplt $21, $31, $19
+ addq $17, $17, $17
+ addq $21, $21, $21
+ addq $21, $18, $21
+ addq $8, $19, $8
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $27
+ cmpult $23, $21, $28
+ addq $23, $27, $23
+ addq $8, $28, $8
+ mulq $7, $0, $20
+ umulh $7, $0, $25
+ cmplt $20, $31, $24
+ cmplt $25, $31, $18
+ addq $20, $20, $20
+ addq $25, $25, $25
+ addq $25, $24, $25
+ addq $8, $18, $8
+ addq $22, $20, $22
+ addq $23, $25, $23
+ cmpult $22, $20, $19
+ cmpult $23, $25, $17
+ addq $23, $19, $23
+ addq $8, $17, $8
+ stq $22, 56($16)
+ bis $31, $31, $22
+ mulq $4, $4, $21
+ umulh $4, $4, $27
+ addq $23, $21, $23
+ addq $8, $27, $8
+ cmpult $23, $21, $28
+ cmpult $8, $27, $24
+ addq $8, $28, $8
+ addq $22, $24, $22
+ mulq $5, $3, $18
+ umulh $5, $3, $20
+ cmplt $18, $31, $25
+ cmplt $20, $31, $19
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $25, $20
+ addq $22, $19, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $21
+ addq $8, $17, $8
+ addq $22, $21, $22
+ mulq $6, $2, $27
+ umulh $6, $2, $28
+ cmplt $27, $31, $24
+ cmplt $28, $31, $25
+ addq $27, $27, $27
+ addq $28, $28, $28
+ addq $28, $24, $28
+ addq $22, $25, $22
+ addq $23, $27, $23
+ addq $8, $28, $8
+ cmpult $23, $27, $19
+ cmpult $8, $28, $18
+ addq $8, $19, $8
+ addq $22, $18, $22
+ mulq $7, $1, $20
+ umulh $7, $1, $17
+ cmplt $20, $31, $21
+ cmplt $17, $31, $24
+ addq $20, $20, $20
+ addq $17, $17, $17
+ addq $17, $21, $17
+ addq $22, $24, $22
+ addq $23, $20, $23
+ addq $8, $17, $8
+ cmpult $23, $20, $25
+ cmpult $8, $17, $27
+ addq $8, $25, $8
+ addq $22, $27, $22
+ stq $23, 64($16)
+ bis $31, $31, $23
+ mulq $5, $4, $28
+ umulh $5, $4, $19
+ cmplt $28, $31, $18
+ cmplt $19, $31, $21
+ addq $28, $28, $28
+ addq $19, $19, $19
+ addq $19, $18, $19
+ addq $23, $21, $23
+ addq $8, $28, $8
+ addq $22, $19, $22
+ cmpult $8, $28, $24
+ cmpult $22, $19, $20
+ addq $22, $24, $22
+ addq $23, $20, $23
+ mulq $6, $3, $17
+ umulh $6, $3, $25
+ cmplt $17, $31, $27
+ cmplt $25, $31, $18
+ addq $17, $17, $17
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $23, $18, $23
+ addq $8, $17, $8
+ addq $22, $25, $22
+ cmpult $8, $17, $21
+ cmpult $22, $25, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $7, $2, $19
+ umulh $7, $2, $24
+ cmplt $19, $31, $20
+ cmplt $24, $31, $27
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $23, $27, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $18
+ cmpult $22, $24, $17
+ addq $22, $18, $22
+ addq $23, $17, $23
+ stq $8, 72($16)
+ bis $31, $31, $8
+ mulq $5, $5, $25
+ umulh $5, $5, $21
+ addq $22, $25, $22
+ addq $23, $21, $23
+ cmpult $22, $25, $28
+ cmpult $23, $21, $20
+ addq $23, $28, $23
+ addq $8, $20, $8
+ mulq $6, $4, $27
+ umulh $6, $4, $19
+ cmplt $27, $31, $24
+ cmplt $19, $31, $18
+ addq $27, $27, $27
+ addq $19, $19, $19
+ addq $19, $24, $19
+ addq $8, $18, $8
+ addq $22, $27, $22
+ addq $23, $19, $23
+ cmpult $22, $27, $17
+ cmpult $23, $19, $25
+ addq $23, $17, $23
+ addq $8, $25, $8
+ mulq $7, $3, $21
+ umulh $7, $3, $28
+ cmplt $21, $31, $20
+ cmplt $28, $31, $24
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $20, $28
+ addq $8, $24, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $18
+ cmpult $23, $28, $27
+ addq $23, $18, $23
+ addq $8, $27, $8
+ stq $22, 80($16)
+ bis $31, $31, $22
+ mulq $6, $5, $19
+ umulh $6, $5, $17
+ cmplt $19, $31, $25
+ cmplt $17, $31, $20
+ addq $19, $19, $19
+ addq $17, $17, $17
+ addq $17, $25, $17
+ addq $22, $20, $22
+ addq $23, $19, $23
+ addq $8, $17, $8
+ cmpult $23, $19, $24
+ cmpult $8, $17, $21
+ addq $8, $24, $8
+ addq $22, $21, $22
+ mulq $7, $4, $28
+ umulh $7, $4, $18
+ cmplt $28, $31, $27
+ cmplt $18, $31, $25
+ addq $28, $28, $28
+ addq $18, $18, $18
+ addq $18, $27, $18
+ addq $22, $25, $22
+ addq $23, $28, $23
+ addq $8, $18, $8
+ cmpult $23, $28, $20
+ cmpult $8, $18, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 88($16)
+ bis $31, $31, $23
+ mulq $6, $6, $17
+ umulh $6, $6, $24
+ addq $8, $17, $8
+ addq $22, $24, $22
+ cmpult $8, $17, $21
+ cmpult $22, $24, $27
+ addq $22, $21, $22
+ addq $23, $27, $23
+ mulq $7, $5, $25
+ umulh $7, $5, $28
+ cmplt $25, $31, $18
+ cmplt $28, $31, $20
+ addq $25, $25, $25
+ addq $28, $28, $28
+ addq $28, $18, $28
+ addq $23, $20, $23
+ addq $8, $25, $8
+ addq $22, $28, $22
+ cmpult $8, $25, $19
+ cmpult $22, $28, $17
+ addq $22, $19, $22
+ addq $23, $17, $23
+ stq $8, 96($16)
+ bis $31, $31, $8
+ mulq $7, $6, $24
+ umulh $7, $6, $21
+ cmplt $24, $31, $27
+ cmplt $21, $31, $18
+ addq $24, $24, $24
+ addq $21, $21, $21
+ addq $21, $27, $21
+ addq $8, $18, $8
+ addq $22, $24, $22
+ addq $23, $21, $23
+ cmpult $22, $24, $20
+ cmpult $23, $21, $25
+ addq $23, $20, $23
+ addq $8, $25, $8
+ stq $22, 104($16)
+ bis $31, $31, $22
+ mulq $7, $7, $28
+ umulh $7, $7, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ cmpult $23, $28, $17
+ cmpult $8, $19, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ stq $23, 112($16)
+ stq $8, 120($16)
+ ret $31,($26),1
+ .end bn_sqr_comba8
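Unlike the x86 code, the Alpha routine above cannot lean on a carry flag. After each addq the generator issues a cmpult (compare unsigned less-than) of the sum against one addend to recover the carry, and before doubling a cross product it issues cmplt against $31 (the always-zero register) to capture the top bit that the doubling add shifts out. A rough C equivalent, assuming 64-bit words; the helper names are ad hoc, not the source's:

	#include <stdint.h>

	/* addq a, b, s  followed by  cmpult s, b, carry:
	 * an unsigned sum wrapped iff it is smaller than an addend. */
	static uint64_t add_with_carry(uint64_t a, uint64_t b, uint64_t *carry)
	{
	    uint64_t s = a + b;
	    *carry = (s < b);
	    return s;
	}

	/* cmplt x, $31, t:  signed x < 0, i.e. the top bit of x,
	 * saved before  addq x, x, x  doubles it away. */
	static uint64_t top_bit(uint64_t x)
	{
	    return x >> 63;
	}

The mulq/umulh pair plays the role of x86's mull, producing the low and high halves of the 128-bit product in two separate instructions.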
diff --git a/crypto/bn/asm/f.c b/crypto/bn/asm/f.c
new file mode 100644
index 0000000000..bfdccae4a0
--- /dev/null
+++ b/crypto/bn/asm/f.c
@@ -0,0 +1,8 @@
+int abc(a,b,c,d,e,f,g,h,i,j)
+unsigned long a,b,c,d,e,f,g,h,i,j;
+ {
+ gg(g);
+ if (g)
+ gg(h);
+ gg(i);
+ }
diff --git a/crypto/bn/asm/f.elf b/crypto/bn/asm/f.elf
new file mode 100644
index 0000000000..39d07b79e1
--- /dev/null
+++ b/crypto/bn/asm/f.elf
@@ -0,0 +1,2149 @@
+ # Don't even think of reading this code
+ # It was automatically generated by bn-586.pl
+	# Which is a perl program used to generate the x86 assembler for
+	# any of elf, a.out, BSDI, Win32, or Solaris
+ # eric <eay@cryptsoft.com>
+
+ .file "bn-586.s"
+ .version "01.01"
+gcc2_compiled.:
+.text
+ .align 16
+.globl bn_mul_add_words
+ .type bn_mul_add_words,@function
+bn_mul_add_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %esi, %esi
+ movl 20(%esp), %edi
+ movl 28(%esp), %ecx
+ movl 24(%esp), %ebx
+ andl $4294967288, %ecx
+ movl 32(%esp), %ebp
+ pushl %ecx
+ jz .L000maw_finish
+.L001maw_loop:
+ movl %ecx, (%esp)
+ # Round 0
+ movl (%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl (%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, (%edi)
+ movl %edx, %esi
+ # Round 4
+ movl 4(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 4(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 4(%edi)
+ movl %edx, %esi
+ # Round 8
+ movl 8(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 8(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 8(%edi)
+ movl %edx, %esi
+ # Round 12
+ movl 12(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 12(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 12(%edi)
+ movl %edx, %esi
+ # Round 16
+ movl 16(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 16(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 16(%edi)
+ movl %edx, %esi
+ # Round 20
+ movl 20(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 20(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 20(%edi)
+ movl %edx, %esi
+ # Round 24
+ movl 24(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 24(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 24(%edi)
+ movl %edx, %esi
+ # Round 28
+ movl 28(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 28(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 28(%edi)
+ movl %edx, %esi
+
+ movl (%esp), %ecx
+ addl $32, %ebx
+ addl $32, %edi
+ subl $8, %ecx
+ jnz .L001maw_loop
+.L000maw_finish:
+ movl 32(%esp), %ecx
+ andl $7, %ecx
+ jnz .L002maw_finish2
+ jmp .L003maw_end
+.align 16
+.L002maw_finish2:
+ # Tail Round 0
+ movl (%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl (%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, (%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 1
+ movl 4(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 4(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, 4(%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 2
+ movl 8(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 8(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, 8(%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 3
+ movl 12(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 12(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, 12(%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 4
+ movl 16(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 16(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, 16(%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 5
+ movl 20(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 20(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ decl %ecx
+ movl %eax, 20(%edi)
+ movl %edx, %esi
+ jz .L003maw_end
+ # Tail Round 6
+ movl 24(%ebx), %eax
+ mull %ebp
+ addl %esi, %eax
+ movl 24(%edi), %esi
+ adcl $0, %edx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 24(%edi)
+ movl %edx, %esi
+.L003maw_end:
+ movl %esi, %eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_mul_add_words_end:
+ .size bn_mul_add_words,.bn_mul_add_words_end-bn_mul_add_words
+.ident "bn_mul_add_words"
+.text
+ .align 16
+.globl bn_mul_words
+ .type bn_mul_words,@function
+bn_mul_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %esi, %esi
+ movl 20(%esp), %edi
+ movl 24(%esp), %ebx
+ movl 28(%esp), %ebp
+ movl 32(%esp), %ecx
+ andl $4294967288, %ebp
+ jz .L004mw_finish
+.L005mw_loop:
+ # Round 0
+ movl (%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, (%edi)
+ movl %edx, %esi
+ # Round 4
+ movl 4(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 4(%edi)
+ movl %edx, %esi
+ # Round 8
+ movl 8(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 8(%edi)
+ movl %edx, %esi
+ # Round 12
+ movl 12(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 12(%edi)
+ movl %edx, %esi
+ # Round 16
+ movl 16(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 16(%edi)
+ movl %edx, %esi
+ # Round 20
+ movl 20(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 20(%edi)
+ movl %edx, %esi
+ # Round 24
+ movl 24(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 24(%edi)
+ movl %edx, %esi
+ # Round 28
+ movl 28(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 28(%edi)
+ movl %edx, %esi
+
+ addl $32, %ebx
+ addl $32, %edi
+ subl $8, %ebp
+ jz .L004mw_finish
+ jmp .L005mw_loop
+.L004mw_finish:
+ movl 28(%esp), %ebp
+ andl $7, %ebp
+ jnz .L006mw_finish2
+ jmp .L007mw_end
+.align 16
+.L006mw_finish2:
+ # Tail Round 0
+ movl (%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, (%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 1
+ movl 4(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 4(%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 2
+ movl 8(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 8(%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 3
+ movl 12(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 12(%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 4
+ movl 16(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 16(%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 5
+ movl 20(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 20(%edi)
+ movl %edx, %esi
+ decl %ebp
+ jz .L007mw_end
+ # Tail Round 6
+ movl 24(%ebx), %eax
+ mull %ecx
+ addl %esi, %eax
+ adcl $0, %edx
+ movl %eax, 24(%edi)
+ movl %edx, %esi
+.L007mw_end:
+ movl %esi, %eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_mul_words_end:
+ .size bn_mul_words,.bn_mul_words_end-bn_mul_words
+.ident "bn_mul_words"
+.text
+ .align 16
+.globl bn_sqr_words
+ .type bn_sqr_words,@function
+bn_sqr_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ movl 20(%esp), %esi
+ movl 24(%esp), %edi
+ movl 28(%esp), %ebx
+ andl $4294967288, %ebx
+ jz .L008sw_finish
+.L009sw_loop:
+ # Round 0
+ movl (%edi), %eax
+ mull %eax
+ movl %eax, (%esi)
+ movl %edx, 4(%esi)
+ # Round 4
+ movl 4(%edi), %eax
+ mull %eax
+ movl %eax, 8(%esi)
+ movl %edx, 12(%esi)
+ # Round 8
+ movl 8(%edi), %eax
+ mull %eax
+ movl %eax, 16(%esi)
+ movl %edx, 20(%esi)
+ # Round 12
+ movl 12(%edi), %eax
+ mull %eax
+ movl %eax, 24(%esi)
+ movl %edx, 28(%esi)
+ # Round 16
+ movl 16(%edi), %eax
+ mull %eax
+ movl %eax, 32(%esi)
+ movl %edx, 36(%esi)
+ # Round 20
+ movl 20(%edi), %eax
+ mull %eax
+ movl %eax, 40(%esi)
+ movl %edx, 44(%esi)
+ # Round 24
+ movl 24(%edi), %eax
+ mull %eax
+ movl %eax, 48(%esi)
+ movl %edx, 52(%esi)
+ # Round 28
+ movl 28(%edi), %eax
+ mull %eax
+ movl %eax, 56(%esi)
+ movl %edx, 60(%esi)
+
+ addl $32, %edi
+ addl $64, %esi
+ subl $8, %ebx
+ jnz .L009sw_loop
+.L008sw_finish:
+ movl 28(%esp), %ebx
+ andl $7, %ebx
+ jz .L010sw_end
+ # Tail Round 0
+ movl (%edi), %eax
+ mull %eax
+ movl %eax, (%esi)
+ decl %ebx
+ movl %edx, 4(%esi)
+ jz .L010sw_end
+ # Tail Round 1
+ movl 4(%edi), %eax
+ mull %eax
+ movl %eax, 8(%esi)
+ decl %ebx
+ movl %edx, 12(%esi)
+ jz .L010sw_end
+ # Tail Round 2
+ movl 8(%edi), %eax
+ mull %eax
+ movl %eax, 16(%esi)
+ decl %ebx
+ movl %edx, 20(%esi)
+ jz .L010sw_end
+ # Tail Round 3
+ movl 12(%edi), %eax
+ mull %eax
+ movl %eax, 24(%esi)
+ decl %ebx
+ movl %edx, 28(%esi)
+ jz .L010sw_end
+ # Tail Round 4
+ movl 16(%edi), %eax
+ mull %eax
+ movl %eax, 32(%esi)
+ decl %ebx
+ movl %edx, 36(%esi)
+ jz .L010sw_end
+ # Tail Round 5
+ movl 20(%edi), %eax
+ mull %eax
+ movl %eax, 40(%esi)
+ decl %ebx
+ movl %edx, 44(%esi)
+ jz .L010sw_end
+ # Tail Round 6
+ movl 24(%edi), %eax
+ mull %eax
+ movl %eax, 48(%esi)
+ movl %edx, 52(%esi)
+.L010sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_sqr_words_end:
+ .size bn_sqr_words,.bn_sqr_words_end-bn_sqr_words
+.ident "bn_sqr_words"
+.text
+ .align 16
+.globl bn_div64
+ .type bn_div64,@function
+bn_div64:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp), %edx
+ movl 24(%esp), %eax
+ movl 28(%esp), %ebx
+ divl %ebx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_div64_end:
+ .size bn_div64,.bn_div64_end-bn_div64
+.ident "bn_div64"
+.text
+ .align 16
+.globl bn_add_words
+ .type bn_add_words,@function
+bn_add_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ movl 20(%esp), %ebx
+ movl 24(%esp), %esi
+ movl 28(%esp), %edi
+ movl 32(%esp), %ebp
+ xorl %eax, %eax
+ andl $4294967288, %ebp
+ jz .L011aw_finish
+.L012aw_loop:
+ # Round 0
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, (%ebx)
+ # Round 1
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 4(%ebx)
+ # Round 2
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 8(%ebx)
+ # Round 3
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 12(%ebx)
+ # Round 4
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 16(%ebx)
+ # Round 5
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 20(%ebx)
+ # Round 6
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+ # Round 7
+ movl 28(%esi), %ecx
+ movl 28(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 28(%ebx)
+
+ addl $32, %esi
+ addl $32, %edi
+ addl $32, %ebx
+ subl $8, %ebp
+ jnz .L012aw_loop
+.L011aw_finish:
+ movl 32(%esp), %ebp
+ andl $7, %ebp
+ jz .L013aw_end
+ # Tail Round 0
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, (%ebx)
+ jz .L013aw_end
+ # Tail Round 1
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 4(%ebx)
+ jz .L013aw_end
+ # Tail Round 2
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 8(%ebx)
+ jz .L013aw_end
+ # Tail Round 3
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 12(%ebx)
+ jz .L013aw_end
+ # Tail Round 4
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 16(%ebx)
+ jz .L013aw_end
+ # Tail Round 5
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 20(%ebx)
+ jz .L013aw_end
+ # Tail Round 6
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ addl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ addl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+.L013aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_add_words_end:
+ .size bn_add_words,.bn_add_words_end-bn_add_words
+.ident "bn_add_words"
+.text
+ .align 16
+.globl bn_sub_words
+ .type bn_sub_words,@function
+bn_sub_words:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ movl 20(%esp), %ebx
+ movl 24(%esp), %esi
+ movl 28(%esp), %edi
+ movl 32(%esp), %ebp
+ xorl %eax, %eax
+ andl $4294967288, %ebp
+ jz .L014aw_finish
+.L015aw_loop:
+ # Round 0
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, (%ebx)
+ # Round 1
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 4(%ebx)
+ # Round 2
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 8(%ebx)
+ # Round 3
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 12(%ebx)
+ # Round 4
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 16(%ebx)
+ # Round 5
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 20(%ebx)
+ # Round 6
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+ # Round 7
+ movl 28(%esi), %ecx
+ movl 28(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 28(%ebx)
+
+ addl $32, %esi
+ addl $32, %edi
+ addl $32, %ebx
+ subl $8, %ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp), %ebp
+ andl $7, %ebp
+ jz .L016aw_end
+ # Tail Round 0
+ movl (%esi), %ecx
+ movl (%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, (%ebx)
+ jz .L016aw_end
+ # Tail Round 1
+ movl 4(%esi), %ecx
+ movl 4(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 4(%ebx)
+ jz .L016aw_end
+ # Tail Round 2
+ movl 8(%esi), %ecx
+ movl 8(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 8(%ebx)
+ jz .L016aw_end
+ # Tail Round 3
+ movl 12(%esi), %ecx
+ movl 12(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 12(%ebx)
+ jz .L016aw_end
+ # Tail Round 4
+ movl 16(%esi), %ecx
+ movl 16(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 16(%ebx)
+ jz .L016aw_end
+ # Tail Round 5
+ movl 20(%esi), %ecx
+ movl 20(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ decl %ebp
+ movl %ecx, 20(%ebx)
+ jz .L016aw_end
+ # Tail Round 6
+ movl 24(%esi), %ecx
+ movl 24(%edi), %edx
+ subl %eax, %ecx
+ movl $0, %eax
+ adcl %eax, %eax
+ subl %edx, %ecx
+ adcl $0, %eax
+ movl %ecx, 24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.bn_sub_words_end:
+ .size bn_sub_words,.bn_sub_words_end-bn_sub_words
+.ident "bn_sub_words"
+.text
+ .align 16
+.globl bn_mul_comba8
+ .type bn_mul_comba8,@function
+bn_mul_comba8:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ # ################## Calculate word 0
+ xorl %ebp, %ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx, %ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx, %ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp, %ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 16(%esi), %eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx, %ebx
+ # mul a[4]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 20(%esi), %eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx, %ecx
+ # mul a[5]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 24(%esi), %eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp, %ebp
+ # mul a[6]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[4]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[4]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ movl 28(%esi), %eax
+ # saved r[6]
+ # ################## Calculate word 7
+ xorl %ebx, %ebx
+ # mul a[7]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[5]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[4]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[5]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 28(%eax)
+ movl 28(%esi), %eax
+ # saved r[7]
+ # ################## Calculate word 8
+ xorl %ecx, %ecx
+ # mul a[7]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[6]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[5]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 16(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl 12(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[3]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[6]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%eax)
+ movl 28(%esi), %eax
+ # saved r[8]
+ # ################## Calculate word 9
+ xorl %ebp, %ebp
+ # mul a[7]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[6]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 16(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[4]
+ mull %edx
+ addl %eax, %ebx
+ movl 16(%esi), %eax
+ adcl %edx, %ecx
+ movl 20(%edi), %edx
+ adcl $0, %ebp
+ # mul a[4]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl 12(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[3]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[7]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 36(%eax)
+ movl 28(%esi), %eax
+ # saved r[9]
+ # ################## Calculate word 10
+ xorl %ebx, %ebx
+ # mul a[7]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[4]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esi), %eax
+ adcl %edx, %ebp
+ movl 20(%edi), %edx
+ adcl $0, %ebx
+ # mul a[5]*b[5]
+ mull %edx
+ addl %eax, %ecx
+ movl 16(%esi), %eax
+ adcl %edx, %ebp
+ movl 24(%edi), %edx
+ adcl $0, %ebx
+ # mul a[4]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl 12(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[3]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 16(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%eax)
+ movl 28(%esi), %eax
+ # saved r[10]
+ # ################## Calculate word 11
+ xorl %ecx, %ecx
+ # mul a[7]*b[4]
+ mull %edx
+ addl %eax, %ebp
+ movl 24(%esi), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ # mul a[6]*b[5]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esi), %eax
+ adcl %edx, %ebx
+ movl 24(%edi), %edx
+ adcl $0, %ecx
+ # mul a[5]*b[6]
+ mull %edx
+ addl %eax, %ebp
+ movl 16(%esi), %eax
+ adcl %edx, %ebx
+ movl 28(%edi), %edx
+ adcl $0, %ecx
+ # mul a[4]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 20(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 44(%eax)
+ movl 28(%esi), %eax
+ # saved r[11]
+ # ################## Calculate word 12
+ xorl %ebp, %ebp
+ # mul a[7]*b[5]
+ mull %edx
+ addl %eax, %ebx
+ movl 24(%esi), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ # mul a[6]*b[6]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esi), %eax
+ adcl %edx, %ecx
+ movl 28(%edi), %edx
+ adcl $0, %ebp
+ # mul a[5]*b[7]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 24(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%eax)
+ movl 28(%esi), %eax
+ # saved r[12]
+ # ################## Calculate word 13
+ xorl %ebx, %ebx
+ # mul a[7]*b[6]
+ mull %edx
+ addl %eax, %ecx
+ movl 24(%esi), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ # mul a[6]*b[7]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 28(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 52(%eax)
+ movl 28(%esi), %eax
+ # saved r[13]
+ # ################## Calculate word 14
+ xorl %ecx, %ecx
+ # mul a[7]*b[7]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%eax)
+ # saved r[14]
+ # save r[15]
+ movl %ebx, 60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba8_end:
+ .size bn_mul_comba8,.bn_mul_comba8_end-bn_mul_comba8
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_mul_comba4
+ .type bn_mul_comba4,@function
+bn_mul_comba4:
+ pushl %esi
+ movl 12(%esp), %esi
+ pushl %edi
+ movl 20(%esp), %edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx, %ebx
+ movl (%esi), %eax
+ xorl %ecx, %ecx
+ movl (%edi), %edx
+ # ################## Calculate word 0
+ xorl %ebp, %ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl (%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%eax)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx, %ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax, %ecx
+ movl (%esi), %eax
+ adcl %edx, %ebp
+ movl 4(%edi), %edx
+ adcl $0, %ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl (%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 4(%eax)
+ movl 8(%esi), %eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx, %ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax, %ebp
+ movl 4(%esi), %eax
+ adcl %edx, %ebx
+ movl 4(%edi), %edx
+ adcl $0, %ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax, %ebp
+ movl (%esi), %eax
+ adcl %edx, %ebx
+ movl 8(%edi), %edx
+ adcl $0, %ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl (%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%eax)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp, %ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax, %ebx
+ movl 8(%esi), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax, %ebx
+ movl 4(%esi), %eax
+ adcl %edx, %ecx
+ movl 8(%edi), %edx
+ adcl $0, %ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax, %ebx
+ movl (%esi), %eax
+ adcl %edx, %ecx
+ movl 12(%edi), %edx
+ adcl $0, %ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ movl 4(%edi), %edx
+ adcl $0, %ebp
+ movl %ebx, 12(%eax)
+ movl 12(%esi), %eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx, %ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax, %ecx
+ movl 8(%esi), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax, %ecx
+ movl 4(%esi), %eax
+ adcl %edx, %ebp
+ movl 12(%edi), %edx
+ adcl $0, %ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax, %ecx
+ movl 20(%esp), %eax
+ adcl %edx, %ebp
+ movl 8(%edi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%eax)
+ movl 12(%esi), %eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx, %ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax, %ebp
+ movl 8(%esi), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax, %ebp
+ movl 20(%esp), %eax
+ adcl %edx, %ebx
+ movl 12(%edi), %edx
+ adcl $0, %ecx
+ movl %ebp, 20(%eax)
+ movl 12(%esi), %eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp, %ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax, %ebx
+ movl 20(%esp), %eax
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%eax)
+ # saved r[6]
+ # save r[7]
+ movl %ecx, 28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_mul_comba4_end:
+ .size bn_mul_comba4,.bn_mul_comba4_end-bn_mul_comba4
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_sqr_comba8
+ .type bn_sqr_comba8,@function
+bn_sqr_comba8:
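+	# Comba squaring: each off-diagonal product a[i]*a[j] (i != j)
+	# occurs twice in the square, so it is doubled in place with
+	# addl %eax,%eax / adcl %edx,%edx before being accumulated; the
+	# diagonal products a[i]*a[i] are added only once.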
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ # ############### Calculate word 0
+ xorl %ebp, %ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx, %ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx, %ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp, %ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl (%esi), %edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx, %ebx
+ # sqr a[4]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 12(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl (%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 20(%esi), %eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx, %ecx
+ # sqr a[5]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ movl 4(%esi), %edx
+ # sqr a[4]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ movl (%esi), %edx
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp, %ebp
+ # sqr a[6]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[5]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 16(%esi), %eax
+ adcl $0, %ebp
+ movl 8(%esi), %edx
+ # sqr a[4]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ movl 28(%esi), %eax
+ # saved r[6]
+ # ############### Calculate word 7
+ xorl %ebx, %ebx
+ # sqr a[7]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 4(%esi), %edx
+ # sqr a[6]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ movl 8(%esi), %edx
+ # sqr a[5]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %eax
+ adcl $0, %ebx
+ movl 12(%esi), %edx
+ # sqr a[4]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 28(%edi)
+ movl 4(%esi), %edx
+ # saved r[7]
+ # ############### Calculate word 8
+ xorl %ecx, %ecx
+ # sqr a[7]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 8(%esi), %edx
+ # sqr a[6]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 20(%esi), %eax
+ adcl $0, %ecx
+ movl 12(%esi), %edx
+ # sqr a[5]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 16(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[4]*a[4]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 8(%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 32(%edi)
+ movl 28(%esi), %eax
+ # saved r[8]
+ # ############### Calculate word 9
+ xorl %ebp, %ebp
+ # sqr a[7]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ movl 12(%esi), %edx
+ # sqr a[6]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 20(%esi), %eax
+ adcl $0, %ebp
+ movl 16(%esi), %edx
+ # sqr a[5]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 28(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 36(%edi)
+ movl 12(%esi), %edx
+ # saved r[9]
+ # ############### Calculate word 10
+ xorl %ebx, %ebx
+ # sqr a[7]*a[3]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 24(%esi), %eax
+ adcl $0, %ebx
+ movl 16(%esi), %edx
+ # sqr a[6]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 20(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[5]*a[5]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 16(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 40(%edi)
+ movl 28(%esi), %eax
+ # saved r[10]
+ # ############### Calculate word 11
+ xorl %ecx, %ecx
+ # sqr a[7]*a[4]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 24(%esi), %eax
+ adcl $0, %ecx
+ movl 20(%esi), %edx
+ # sqr a[6]*a[5]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 28(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 44(%edi)
+ movl 20(%esi), %edx
+ # saved r[11]
+ # ############### Calculate word 12
+ xorl %ebp, %ebp
+ # sqr a[7]*a[5]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %eax
+ adcl $0, %ebp
+ # sqr a[6]*a[6]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 24(%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, 48(%edi)
+ movl 28(%esi), %eax
+ # saved r[12]
+ # ############### Calculate word 13
+ xorl %ebx, %ebx
+ # sqr a[7]*a[6]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 28(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 52(%edi)
+ # saved r[13]
+ # ############### Calculate word 14
+ xorl %ecx, %ecx
+ # sqr a[7]*a[7]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ adcl $0, %ecx
+ movl %ebp, 56(%edi)
+ # saved r[14]
+ movl %ebx, 60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba8_end:
+ .size bn_sqr_comba8,.bn_sqr_comba8_end-bn_sqr_comba8
+.ident "desasm.pl"
+.text
+ .align 16
+.globl bn_sqr_comba4
+ .type bn_sqr_comba4,@function
+bn_sqr_comba4:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp), %edi
+ movl 24(%esp), %esi
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ movl (%esi), %eax
+ # ############### Calculate word 0
+ xorl %ebp, %ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl (%esi), %edx
+ adcl $0, %ebp
+ movl %ebx, (%edi)
+ movl 4(%esi), %eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx, %ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ movl %ecx, 4(%edi)
+ movl (%esi), %edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx, %ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 4(%esi), %eax
+ adcl $0, %ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl (%esi), %edx
+ adcl $0, %ecx
+ movl %ebp, 8(%edi)
+ movl 12(%esi), %eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp, %ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 8(%esi), %eax
+ adcl $0, %ebp
+ movl 4(%esi), %edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebp
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ movl 12(%esi), %eax
+ adcl $0, %ebp
+ movl %ebx, 12(%edi)
+ movl 4(%esi), %edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx, %ebx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ebx
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %eax
+ adcl $0, %ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax, %ecx
+ adcl %edx, %ebp
+ movl 8(%esi), %edx
+ adcl $0, %ebx
+ movl %ecx, 16(%edi)
+ movl 12(%esi), %eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx, %ecx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax, %eax
+ adcl %edx, %edx
+ adcl $0, %ecx
+ addl %eax, %ebp
+ adcl %edx, %ebx
+ movl 12(%esi), %eax
+ adcl $0, %ecx
+ movl %ebp, 20(%edi)
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp, %ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax, %ebx
+ adcl %edx, %ecx
+ adcl $0, %ebp
+ movl %ebx, 24(%edi)
+ # saved r[6]
+ movl %ecx, 28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.bn_sqr_comba4_end:
+ .size bn_sqr_comba4,.bn_sqr_comba4_end-bn_sqr_comba4
+.ident "desasm.pl"
diff --git a/crypto/bn/asm/f.s b/crypto/bn/asm/f.s
new file mode 100644
index 0000000000..2f8f63c690
--- /dev/null
+++ b/crypto/bn/asm/f.s
@@ -0,0 +1,1773 @@
+	# Don't even think of reading this code
+	# It was automatically generated by bn-586.pl
+	# which is a perl program used to generate the Alpha assembler.
+	# eric <eay@cryptsoft.com>
+
+	# DEC Alpha assembler
+	# Generated from perl scripts contained in SSLeay
+ .file 1 "bn-586.s"
+ .set noat
+ .text
+ .align 3
+ .globl bn_mul_words
+ .ent bn_mul_words
+bn_mul_words:
+bn_mul_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
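+	# bn_mul_words(rp, ap, num, w): rp[i] = low 64 bits of ap[i]*w + c,
+	# with the umulh high half carried into the next word; the final
+	# carry is returned in $0.  The unconditional br below appears to
+	# deliberately skip the four-word preload and run everything
+	# through the one-word-per-iteration loop at $101.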
+ subq $18, 4, $18
+ bis $31, $31, $0
+ br $100
+ blt $18, $100
+ ldq $1, 0($17)
+ ldq $2, 0($16)
+$101:
+ ldq $3, 0($17)
+ mulq $3, $19, $4
+ addq $17, 8, $17
+ umulh $3, $19, $5
+ addq $4, $0, $4
+ addq $16, 8, $16
+ subq $18, 1, $18
+ cmpult $4, $0, $0
+ stq $4, -8($16)
+ addq $5, $0, $0
+ bgt $18, $101
+ ret $31,($26),1
+$100:
+ addq $18, 4, $18
+ bgt $18, $101
+$102:
+ ret $31,($26),1
+ .end bn_mul_words
+ .text
+ .align 3
+ .globl bn_sqr_words
+ .ent bn_sqr_words
+bn_sqr_words:
+bn_sqr_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $18, 4, $18
+ bis $31, $31, $0
+ br $103
+ blt $18, $103
+ ldq $1, 0($17)
+ ldq $2, 0($16)
+$104:
+ ldq $3, 0($17)
+ mulq $3, $3, $4
+ addq $17, 8, $17
+ addq $16, 16, $16
+ subq $18, 1, $18
+ umulh $3, $3, $5
+ stq $4, -16($16)
+ stq $5, -8($16)
+ bgt $18, $104
+ ret $31,($26),1
+$103:
+ addq $18, 4, $18
+ bgt $18, $104
+$105:
+ ret $31,($26),1
+ .end bn_sqr_words
+ .text
+ .align 3
+ .globl bn_mul_add_words
+ .ent bn_mul_add_words
+bn_mul_add_words:
+bn_mul_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $18, 4, $18
+ bis $31, $31, $0
+ br $106
+ blt $18, $106
+ ldq $1, 0($17)
+ ldq $2, 0($16)
+$107:
+ ldq $3, 0($17)
+ ldq $4, 0($16)
+ mulq $3, $19, $5
+ subq $18, 1, $18
+ addq $17, 8, $17
+ umulh $3, $19, $6
+ addq $4, $5, $4
+ addq $16, 8, $16
+ cmpult $4, $5, $7
+ addq $4, $0, $4
+ addq $6, $7, $6
+ cmpult $4, $0, $0
+ stq $4, -8($16)
+ addq $6, $0, $0
+ bgt $18, $107
+ ret $31,($26),1
+$106:
+ addq $18, 4, $18
+ bgt $18, $107
+$108:
+ ret $31,($26),1
+ .end bn_mul_add_words
+ .text
+ .align 3
+ .globl bn_add_words
+ .ent bn_add_words
+bn_add_words:
+bn_add_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19, 4, $19
+ bis $31, $31, $0
+ br $109
+ blt $19, $109
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+$110:
+ ldq $3, 8($17)
+ ldq $4, 8($18)
+ ldq $5, 16($17)
+ ldq $6, 16($18)
+ ldq $7, 24($17)
+ ldq $8, 24($18)
+ addq $1, $2, $22
+ cmpult $22, $2, $23
+ addq $22, $0, $22
+ cmpult $22, $0, $0
+ addq $0, $23, $0
+ addq $3, $4, $25
+ cmpult $25, $4, $24
+ addq $25, $0, $25
+ cmpult $25, $0, $0
+ addq $0, $24, $0
+ addq $5, $6, $28
+ cmpult $28, $6, $27
+ addq $28, $0, $28
+ cmpult $28, $0, $0
+ addq $0, $27, $0
+ addq $7, $8, $20
+ cmpult $20, $8, $21
+ addq $20, $0, $20
+ cmpult $20, $0, $0
+ addq $0, $21, $0
+ stq $22, 0($16)
+	stq $25, 8($16)
+	stq $28, 16($16)
+	stq $20, 24($16)
+ subq $19, 4, $19
+ addq $17, 32, $17
+ addq $18, 32, $18
+ addq $16, 32, $16
+ blt $19, $109
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ br $110
+$111:
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ addq $1, $2, $3
+ cmpult $3, $2, $23
+ addq $3, $0, $3
+ cmpult $3, $0, $0
+ addq $0, $23, $0
+ stq $3, 0($16)
+ addq $17, 8, $17
+ addq $18, 8, $18
+ addq $16, 8, $16
+ subq $19, 1, $19
+ bgt $19, $111
+ ret $31,($26),1
+$109:
+ addq $19, 4, $19
+ bgt $19, $111
+$112:
+ ret $31,($26),1
+ .end bn_add_words
+ .text
+ .align 3
+ .globl bn_sub_words
+ .ent bn_sub_words
+bn_sub_words:
+bn_sub_words..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ subq $19, 4, $19
+ bis $31, $31, $0
+ blt $19, $113
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+$114:
+ ldq $3, 8($17)
+ cmpult $1, $2, $4
+ ldq $5, 8($18)
+ subq $1, $2, $1
+ ldq $6, 16($17)
+ cmpult $1, $0, $2
+ ldq $7, 16($18)
+ subq $1, $0, $23
+ ldq $8, 24($17)
+ addq $2, $4, $0
+ cmpult $3, $5, $24
+ subq $3, $5, $3
+ ldq $22, 24($18)
+ cmpult $3, $0, $5
+ subq $3, $0, $25
+ addq $5, $24, $0
+ cmpult $6, $7, $27
+ subq $6, $7, $6
+ stq $23, 0($16)
+ cmpult $6, $0, $7
+ subq $6, $0, $28
+ addq $7, $27, $0
+ cmpult $8, $22, $21
+ subq $8, $22, $8
+ stq $25, 8($16)
+ cmpult $8, $0, $22
+ subq $8, $0, $20
+ addq $22, $21, $0
+ stq $28, 16($16)
+ subq $19, 4, $19
+ stq $20, 24($16)
+ addq $17, 32, $17
+ addq $18, 32, $18
+ addq $16, 32, $16
+ blt $19, $113
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ br $114
+$115:
+ ldq $1, 0($17)
+ ldq $2, 0($18)
+ cmpult $1, $2, $27
+ subq $1, $2, $1
+ cmpult $1, $0, $2
+ subq $1, $0, $1
+ stq $1, 0($16)
+ addq $2, $27, $0
+ addq $17, 8, $17
+ addq $18, 8, $18
+ addq $16, 8, $16
+ subq $19, 1, $19
+ bgt $19, $115
+ ret $31,($26),1
+$113:
+ addq $19, 4, $19
+ bgt $19, $115
+$116:
+ ret $31,($26),1
+ .end bn_sub_words
+ #
+ # What follows was taken directly from the C compiler with a few
+	# hacks to redo the labels.
+ #
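+	#
+	# Roughly, bn_div64(h, l, d) divides the two-word value (h:l) by d:
+	# the divisor is normalized using BN_num_bits_word, then two rounds
+	# of 32-bit schoolbook division ($12 counts down from 2) estimate,
+	# correct and subtract one quotient digit each, and the two digits
+	# are combined into the result in $0.
+	#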
+.text
+ .align 3
+ .globl bn_div64
+ .ent bn_div64
+bn_div64:
+ ldgp $29,0($27)
+bn_div64..ng:
+ lda $30,-48($30)
+ .frame $30,48,$26,0
+ stq $26,0($30)
+ stq $9,8($30)
+ stq $10,16($30)
+ stq $11,24($30)
+ stq $12,32($30)
+ stq $13,40($30)
+ .mask 0x4003e00,-48
+ .prologue 1
+ bis $16,$16,$9
+ bis $17,$17,$10
+ bis $18,$18,$11
+ bis $31,$31,$13
+ bis $31,2,$12
+ bne $11,$9119
+ lda $0,-1
+ br $31,$9136
+ .align 4
+$9119:
+ bis $11,$11,$16
+ jsr $26,BN_num_bits_word
+ ldgp $29,0($26)
+ subq $0,64,$1
+ beq $1,$9120
+ bis $31,1,$1
+ sll $1,$0,$1
+ cmpule $9,$1,$1
+ bne $1,$9120
+ # lda $16,_IO_stderr_
+ # lda $17,$C32
+ # bis $0,$0,$18
+ # jsr $26,fprintf
+ # ldgp $29,0($26)
+ jsr $26,abort
+ ldgp $29,0($26)
+ .align 4
+$9120:
+ bis $31,64,$3
+ cmpult $9,$11,$2
+ subq $3,$0,$1
+ addl $1,$31,$0
+ subq $9,$11,$1
+ cmoveq $2,$1,$9
+ beq $0,$9122
+ zapnot $0,15,$2
+ subq $3,$0,$1
+ sll $11,$2,$11
+ sll $9,$2,$3
+ srl $10,$1,$1
+ sll $10,$2,$10
+ bis $3,$1,$9
+$9122:
+ srl $11,32,$5
+ zapnot $11,15,$6
+ lda $7,-1
+ .align 5
+$9123:
+ srl $9,32,$1
+ subq $1,$5,$1
+ bne $1,$9126
+ zapnot $7,15,$27
+ br $31,$9127
+ .align 4
+$9126:
+ bis $9,$9,$24
+ bis $5,$5,$25
+ divqu $24,$25,$27
+$9127:
+ srl $10,32,$4
+ .align 5
+$9128:
+ mulq $27,$5,$1
+ subq $9,$1,$3
+ zapnot $3,240,$1
+ bne $1,$9129
+ mulq $6,$27,$2
+ sll $3,32,$1
+ addq $1,$4,$1
+ cmpule $2,$1,$2
+ bne $2,$9129
+ subq $27,1,$27
+ br $31,$9128
+ .align 4
+$9129:
+ mulq $27,$6,$1
+ mulq $27,$5,$4
+ srl $1,32,$3
+ sll $1,32,$1
+ addq $4,$3,$4
+ cmpult $10,$1,$2
+ subq $10,$1,$10
+ addq $2,$4,$2
+ cmpult $9,$2,$1
+ bis $2,$2,$4
+ beq $1,$9134
+ addq $9,$11,$9
+ subq $27,1,$27
+$9134:
+ subl $12,1,$12
+ subq $9,$4,$9
+ beq $12,$9124
+ sll $27,32,$13
+ sll $9,32,$2
+ srl $10,32,$1
+ sll $10,32,$10
+ bis $2,$1,$9
+ br $31,$9123
+ .align 4
+$9124:
+ bis $13,$27,$0
+$9136:
+ ldq $26,0($30)
+ ldq $9,8($30)
+ ldq $10,16($30)
+ ldq $11,24($30)
+ ldq $12,32($30)
+ ldq $13,40($30)
+ addq $30,48,$30
+ ret $31,($26),1
+ .end bn_div64
+ .text
+ .align 3
+ .globl bn_mul_comba8
+ .ent bn_mul_comba8
+bn_mul_comba8:
+bn_mul_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
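+	# Alpha has no carry flag: after each 64-bit addq, a cmpult of the
+	# sum against one addend recovers the carry bit, which is then
+	# folded into the running column totals.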
+ subq $30, 16, $30
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ stq $9, 0($30)
+ stq $10, 8($30)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ ldq $4, 16($17)
+ ldq $5, 16($18)
+ ldq $6, 24($17)
+ ldq $7, 24($18)
+	ldq $8, 32($17)	# a[4]..a[7] and b[4]..b[7]
+	ldq $22, 32($18)
+	ldq $23, 40($17)
+	ldq $24, 40($18)
+	ldq $25, 48($17)
+	ldq $27, 48($18)
+	ldq $28, 56($17)
+	ldq $21, 56($18)
+ bis $31, $31, $9
+ mulq $0, $1, $20
+ umulh $0, $1, $19
+ stq $20, 0($16)
+ bis $31, $31, $10
+ mulq $0, $3, $17
+ umulh $0, $3, $18
+ addq $19, $17, $19
+ cmpult $19, $17, $20
+ addq $20, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $17
+ addq $10, $17, $10
+ mulq $2, $1, $20
+ umulh $2, $1, $18
+ addq $19, $20, $19
+ cmpult $19, $20, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $20
+ addq $10, $20, $10
+ stq $19, 8($16)
+ bis $31, $31, $17
+ mulq $0, $5, $18
+ umulh $0, $5, $20
+ addq $9, $18, $9
+ cmpult $9, $18, $19
+ addq $19, $20, $20
+ addq $10, $20, $10
+ cmpult $10, $20, $18
+ addq $17, $18, $17
+ mulq $2, $3, $19
+ umulh $2, $3, $20
+ addq $9, $19, $9
+ cmpult $9, $19, $18
+ addq $18, $20, $20
+ addq $10, $20, $10
+ cmpult $10, $20, $19
+ addq $17, $19, $17
+ mulq $4, $1, $18
+ umulh $4, $1, $20
+ addq $9, $18, $9
+ cmpult $9, $18, $19
+ addq $19, $20, $20
+ addq $10, $20, $10
+ cmpult $10, $20, $18
+ addq $17, $18, $17
+ stq $9, 16($16)
+ bis $31, $31, $19
+ mulq $0, $7, $20
+ umulh $0, $7, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $9
+ addq $9, $18, $18
+ addq $17, $18, $17
+ cmpult $17, $18, $20
+ addq $19, $20, $19
+ mulq $2, $5, $9
+ umulh $2, $5, $18
+ addq $10, $9, $10
+ cmpult $10, $9, $20
+ addq $20, $18, $18
+ addq $17, $18, $17
+ cmpult $17, $18, $9
+ addq $19, $9, $19
+ mulq $4, $3, $20
+ umulh $4, $3, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $9
+ addq $9, $18, $18
+ addq $17, $18, $17
+ cmpult $17, $18, $20
+ addq $19, $20, $19
+ mulq $6, $1, $9
+ umulh $6, $1, $18
+ addq $10, $9, $10
+ cmpult $10, $9, $20
+ addq $20, $18, $18
+ addq $17, $18, $17
+ cmpult $17, $18, $9
+ addq $19, $9, $19
+ stq $10, 24($16)
+ bis $31, $31, $20
+ mulq $0, $22, $18
+ umulh $0, $22, $9
+ addq $17, $18, $17
+ cmpult $17, $18, $10
+ addq $10, $9, $9
+ addq $19, $9, $19
+ cmpult $19, $9, $18
+ addq $20, $18, $20
+ mulq $2, $7, $10
+ umulh $2, $7, $9
+ addq $17, $10, $17
+ cmpult $17, $10, $18
+ addq $18, $9, $9
+ addq $19, $9, $19
+ cmpult $19, $9, $10
+ addq $20, $10, $20
+ mulq $4, $5, $18
+ umulh $4, $5, $9
+ addq $17, $18, $17
+ cmpult $17, $18, $10
+ addq $10, $9, $9
+ addq $19, $9, $19
+ cmpult $19, $9, $18
+ addq $20, $18, $20
+ mulq $6, $3, $10
+ umulh $6, $3, $9
+ addq $17, $10, $17
+ cmpult $17, $10, $18
+ addq $18, $9, $9
+ addq $19, $9, $19
+ cmpult $19, $9, $10
+ addq $20, $10, $20
+ mulq $8, $1, $18
+ umulh $8, $1, $9
+ addq $17, $18, $17
+ cmpult $17, $18, $10
+ addq $10, $9, $9
+ addq $19, $9, $19
+ cmpult $19, $9, $18
+ addq $20, $18, $20
+ stq $17, 32($16)
+ bis $31, $31, $10
+ mulq $0, $24, $9
+ umulh $0, $24, $18
+ addq $19, $9, $19
+ cmpult $19, $9, $17
+ addq $17, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $9
+ addq $10, $9, $10
+ mulq $2, $22, $17
+ umulh $2, $22, $18
+ addq $19, $17, $19
+ cmpult $19, $17, $9
+ addq $9, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $17
+ addq $10, $17, $10
+ mulq $4, $7, $9
+ umulh $4, $7, $18
+ addq $19, $9, $19
+ cmpult $19, $9, $17
+ addq $17, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $9
+ addq $10, $9, $10
+ mulq $6, $5, $17
+ umulh $6, $5, $18
+ addq $19, $17, $19
+ cmpult $19, $17, $9
+ addq $9, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $17
+ addq $10, $17, $10
+ mulq $8, $3, $9
+ umulh $8, $3, $18
+ addq $19, $9, $19
+ cmpult $19, $9, $17
+ addq $17, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $9
+ addq $10, $9, $10
+ mulq $23, $1, $17
+ umulh $23, $1, $18
+ addq $19, $17, $19
+ cmpult $19, $17, $9
+ addq $9, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $17
+ addq $10, $17, $10
+ stq $19, 40($16)
+ bis $31, $31, $9
+ mulq $0, $27, $18
+ umulh $0, $27, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $19
+ addq $19, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $18
+ addq $9, $18, $9
+ mulq $2, $24, $19
+ umulh $2, $24, $17
+ addq $20, $19, $20
+ cmpult $20, $19, $18
+ addq $18, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $19
+ addq $9, $19, $9
+ mulq $4, $22, $18
+ umulh $4, $22, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $19
+ addq $19, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $18
+ addq $9, $18, $9
+ mulq $6, $7, $19
+ umulh $6, $7, $17
+ addq $20, $19, $20
+ cmpult $20, $19, $18
+ addq $18, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $19
+ addq $9, $19, $9
+ mulq $8, $5, $18
+ umulh $8, $5, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $19
+ addq $19, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $18
+ addq $9, $18, $9
+ mulq $23, $3, $19
+ umulh $23, $3, $17
+ addq $20, $19, $20
+ cmpult $20, $19, $18
+ addq $18, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $19
+ addq $9, $19, $9
+ mulq $25, $1, $18
+ umulh $25, $1, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $19
+ addq $19, $17, $17
+ addq $10, $17, $10
+ cmpult $10, $17, $18
+ addq $9, $18, $9
+ stq $20, 48($16)
+ bis $31, $31, $19
+ mulq $0, $21, $17
+ umulh $0, $21, $18
+ addq $10, $17, $10
+ cmpult $10, $17, $20
+ addq $20, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $17
+ addq $19, $17, $19
+ mulq $2, $27, $20
+ umulh $2, $27, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $19, $0, $19
+ mulq $4, $24, $20
+ umulh $4, $24, $17
+ addq $10, $20, $10
+ cmpult $10, $20, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $19, $0, $19
+ mulq $6, $22, $20
+ umulh $6, $22, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $19, $0, $19
+ mulq $8, $7, $20
+ umulh $8, $7, $17
+ addq $10, $20, $10
+ cmpult $10, $20, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $19, $0, $19
+ mulq $23, $5, $20
+ umulh $23, $5, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $19, $0, $19
+ mulq $25, $3, $20
+ umulh $25, $3, $17
+ addq $10, $20, $10
+ cmpult $10, $20, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $19, $0, $19
+ mulq $28, $1, $20
+ umulh $28, $1, $18
+ addq $10, $20, $10
+ cmpult $10, $20, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $19, $0, $19
+ stq $10, 56($16)
+ bis $31, $31, $20
+ mulq $2, $21, $17
+ umulh $2, $21, $18
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $0, $18, $18
+ addq $19, $18, $19
+ cmpult $19, $18, $1
+ addq $20, $1, $20
+ mulq $4, $27, $10
+ umulh $4, $27, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $0
+ addq $0, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $20, $18, $20
+ mulq $6, $24, $1
+ umulh $6, $24, $2
+ addq $9, $1, $9
+ cmpult $9, $1, $10
+ addq $10, $2, $2
+ addq $19, $2, $19
+ cmpult $19, $2, $0
+ addq $20, $0, $20
+ mulq $8, $22, $17
+ umulh $8, $22, $18
+ addq $9, $17, $9
+ cmpult $9, $17, $1
+ addq $1, $18, $18
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $20, $10, $20
+ mulq $23, $7, $2
+ umulh $23, $7, $0
+ addq $9, $2, $9
+ cmpult $9, $2, $17
+ addq $17, $0, $0
+ addq $19, $0, $19
+ cmpult $19, $0, $1
+ addq $20, $1, $20
+ mulq $25, $5, $18
+ umulh $25, $5, $10
+ addq $9, $18, $9
+ cmpult $9, $18, $2
+ addq $2, $10, $10
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $20, $17, $20
+ mulq $28, $3, $0
+ umulh $28, $3, $1
+ addq $9, $0, $9
+ cmpult $9, $0, $18
+ addq $18, $1, $1
+ addq $19, $1, $19
+ cmpult $19, $1, $2
+ addq $20, $2, $20
+ stq $9, 64($16)
+ bis $31, $31, $10
+ mulq $4, $21, $17
+ umulh $4, $21, $0
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $18, $0, $0
+ addq $20, $0, $20
+ cmpult $20, $0, $1
+ addq $10, $1, $10
+ mulq $6, $27, $2
+ umulh $6, $27, $3
+ addq $19, $2, $19
+ cmpult $19, $2, $9
+ addq $9, $3, $3
+ addq $20, $3, $20
+ cmpult $20, $3, $17
+ addq $10, $17, $10
+ mulq $8, $24, $18
+ umulh $8, $24, $0
+ addq $19, $18, $19
+ cmpult $19, $18, $1
+ addq $1, $0, $0
+ addq $20, $0, $20
+ cmpult $20, $0, $4
+ addq $10, $4, $10
+ mulq $23, $22, $2
+ umulh $23, $22, $9
+ addq $19, $2, $19
+ cmpult $19, $2, $3
+ addq $3, $9, $9
+ addq $20, $9, $20
+ cmpult $20, $9, $17
+ addq $10, $17, $10
+ mulq $25, $7, $18
+ umulh $25, $7, $1
+ addq $19, $18, $19
+ cmpult $19, $18, $0
+ addq $0, $1, $1
+ addq $20, $1, $20
+ cmpult $20, $1, $4
+ addq $10, $4, $10
+ mulq $28, $5, $2
+ umulh $28, $5, $3
+ addq $19, $2, $19
+ cmpult $19, $2, $9
+ addq $9, $3, $3
+ addq $20, $3, $20
+ cmpult $20, $3, $17
+ addq $10, $17, $10
+ stq $19, 72($16)
+ bis $31, $31, $18
+ mulq $6, $21, $0
+ umulh $6, $21, $1
+ addq $20, $0, $20
+ cmpult $20, $0, $4
+ addq $4, $1, $1
+ addq $10, $1, $10
+ cmpult $10, $1, $2
+ addq $18, $2, $18
+ mulq $8, $27, $9
+ umulh $8, $27, $3
+ addq $20, $9, $20
+ cmpult $20, $9, $17
+ addq $17, $3, $3
+ addq $10, $3, $10
+ cmpult $10, $3, $5
+ addq $18, $5, $18
+ mulq $23, $24, $19
+ umulh $23, $24, $0
+ addq $20, $19, $20
+ cmpult $20, $19, $4
+ addq $4, $0, $0
+ addq $10, $0, $10
+ cmpult $10, $0, $1
+ addq $18, $1, $18
+ mulq $25, $22, $2
+ umulh $25, $22, $6
+ addq $20, $2, $20
+ cmpult $20, $2, $9
+ addq $9, $6, $6
+ addq $10, $6, $10
+ cmpult $10, $6, $17
+ addq $18, $17, $18
+ mulq $28, $7, $3
+ umulh $28, $7, $5
+ addq $20, $3, $20
+ cmpult $20, $3, $19
+ addq $19, $5, $5
+ addq $10, $5, $10
+ cmpult $10, $5, $4
+ addq $18, $4, $18
+ stq $20, 80($16)
+ bis $31, $31, $0
+ mulq $8, $21, $1
+ umulh $8, $21, $2
+ addq $10, $1, $10
+ cmpult $10, $1, $9
+ addq $9, $2, $2
+ addq $18, $2, $18
+ cmpult $18, $2, $6
+ addq $0, $6, $0
+ mulq $23, $27, $17
+ umulh $23, $27, $3
+ addq $10, $17, $10
+ cmpult $10, $17, $19
+ addq $19, $3, $3
+ addq $18, $3, $18
+ cmpult $18, $3, $5
+ addq $0, $5, $0
+ mulq $25, $24, $4
+ umulh $25, $24, $7
+ addq $10, $4, $10
+ cmpult $10, $4, $20
+ addq $20, $7, $7
+ addq $18, $7, $18
+ cmpult $18, $7, $1
+ addq $0, $1, $0
+ mulq $28, $22, $9
+ umulh $28, $22, $2
+ addq $10, $9, $10
+ cmpult $10, $9, $6
+ addq $6, $2, $2
+ addq $18, $2, $18
+ cmpult $18, $2, $8
+ addq $0, $8, $0
+ stq $10, 88($16)
+ bis $31, $31, $17
+ mulq $23, $21, $19
+ umulh $23, $21, $3
+ addq $18, $19, $18
+ cmpult $18, $19, $5
+ addq $5, $3, $3
+ addq $0, $3, $0
+ cmpult $0, $3, $4
+ addq $17, $4, $17
+ mulq $25, $27, $20
+ umulh $25, $27, $7
+ addq $18, $20, $18
+ cmpult $18, $20, $1
+ addq $1, $7, $7
+ addq $0, $7, $0
+ cmpult $0, $7, $9
+ addq $17, $9, $17
+ mulq $28, $24, $6
+ umulh $28, $24, $2
+ addq $18, $6, $18
+ cmpult $18, $6, $8
+ addq $8, $2, $2
+ addq $0, $2, $0
+ cmpult $0, $2, $22
+ addq $17, $22, $17
+ stq $18, 96($16)
+ bis $31, $31, $10
+ mulq $25, $21, $19
+ umulh $25, $21, $5
+ addq $0, $19, $0
+ cmpult $0, $19, $3
+ addq $3, $5, $5
+ addq $17, $5, $17
+ cmpult $17, $5, $4
+ addq $10, $4, $10
+ mulq $28, $27, $23
+ umulh $28, $27, $20
+ addq $0, $23, $0
+ cmpult $0, $23, $1
+ addq $1, $20, $20
+ addq $17, $20, $17
+ cmpult $17, $20, $7
+ addq $10, $7, $10
+ stq $0, 104($16)
+ bis $31, $31, $9
+ mulq $28, $21, $6
+ umulh $28, $21, $8
+ addq $17, $6, $17
+ cmpult $17, $6, $2
+ addq $2, $8, $8
+ addq $10, $8, $10
+ cmpult $10, $8, $22
+ addq $9, $22, $9
+ stq $17, 112($16)
+ stq $10, 120($16)
+ ldq $9, 0($30)
+ ldq $10, 8($30)
+ addq $30, 16, $30
+ ret $31,($26),1
+ .end bn_mul_comba8
+ .text
+ .align 3
+ .globl bn_mul_comba4
+ .ent bn_mul_comba4
+bn_mul_comba4:
+bn_mul_comba4..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ mulq $0, $1, $4
+ ldq $5, 16($17)
+ ldq $6, 16($18)
+ umulh $0, $1, $7
+ ldq $8, 24($17)
+ ldq $22, 24($18)
+ mulq $0, $3, $23
+ stq $4, 0($16)
+ bis $31, $31, $24
+ mulq $2, $1, $28
+ bis $31, $31, $25
+	bis $31, $31, $27	# zero the second carry word ($27 inferred from the carry chain below)
+ addq $24, $7, $24
+ umulh $0, $3, $21
+ cmpult $24, $7, $20
+ addq $24, $23, $24
+ addq $25, $20, $25
+ umulh $2, $1, $19
+ cmpult $24, $23, $17
+ addq $24, $28, $24
+ addq $27, $17, $27
+ mulq $0, $6, $18
+ cmpult $24, $28, $4
+ addq $25, $4, $25
+ stq $24, 8($16)
+ addq $25, $27, $24
+ bis $31, $31, $25
+ addq $24, $21, $24
+ bis $31, $31, $27
+ mulq $2, $3, $7
+ cmpult $24, $21, $20
+ addq $24, $19, $24
+ addq $25, $20, $25
+ mulq $5, $1, $23
+ cmpult $24, $19, $17
+ addq $24, $7, $24
+ addq $27, $17, $27
+ umulh $0, $6, $28
+ cmpult $24, $18, $4
+ addq $24, $7, $24
+ addq $25, $4, $25
+ umulh $2, $3, $21
+ cmpult $24, $7, $20
+ addq $24, $23, $24
+ addq $27, $20, $27
+ umulh $5, $1, $19
+ cmpult $24, $23, $17
+ addq $25, $17, $25
+ stq $24, 16($16)
+ addq $25, $27, $24
+ bis $31, $31, $25
+ addq $24, $28, $24
+ bis $31, $31, $27
+ mulq $0, $22, $18
+ cmpult $24, $28, $4
+ addq $24, $21, $24
+ addq $25, $4, $25
+ mulq $2, $6, $7
+ cmpult $24, $21, $20
+ addq $24, $19, $24
+ addq $25, $20, $25
+ mulq $5, $3, $23
+ cmpult $24, $19, $17
+ addq $24, $18, $24
+ addq $25, $17, $25
+ mulq $8, $1, $28
+ cmpult $24, $18, $4
+ addq $24, $7, $24
+ addq $25, $4, $25
+ umulh $0, $22, $21
+ cmpult $24, $7, $20
+ addq $24, $23, $24
+ addq $25, $20, $25
+ umulh $2, $6, $19
+ cmpult $24, $23, $17
+ addq $24, $28, $24
+ addq $25, $17, $25
+ umulh $5, $3, $18
+ cmpult $24, $28, $4
+ addq $25, $4, $25
+ stq $24, 24($16)
+ addq $25, $27, $24
+ bis $31, $31, $25
+ addq $24, $21, $24
+ bis $31, $31, $27
+ umulh $8, $1, $7
+ cmpult $24, $21, $20
+ addq $24, $19, $24
+ addq $25, $20, $25
+ mulq $2, $22, $23
+ cmpult $24, $19, $17
+ addq $24, $18, $24
+ addq $25, $17, $25
+ mulq $5, $6, $28
+ cmpult $24, $18, $4
+ addq $24, $7, $24
+ addq $25, $4, $25
+ mulq $8, $3, $21
+ cmpult $24, $7, $20
+ addq $24, $23, $24
+ addq $25, $20, $25
+ umulh $2, $22, $19
+ cmpult $24, $23, $17
+ addq $24, $28, $24
+ addq $25, $17, $25
+ umulh $5, $6, $18
+ cmpult $24, $28, $4
+ addq $24, $21, $24
+ addq $25, $4, $25
+ umulh $8, $3, $7
+ cmpult $24, $21, $20
+ addq $25, $20, $25
+ stq $24, 32($16)
+ addq $25, $27, $24
+ bis $31, $31, $25
+ addq $24, $19, $24
+ bis $31, $31, $27
+ mulq $5, $22, $23
+ cmpult $24, $19, $17
+ addq $24, $18, $24
+ addq $25, $17, $25
+ mulq $8, $6, $28
+ cmpult $24, $18, $4
+ addq $24, $7, $24
+ addq $25, $4, $25
+ umulh $5, $22, $21
+ cmpult $24, $7, $20
+ addq $24, $23, $24
+ addq $25, $20, $25
+ umulh $8, $6, $19
+ cmpult $24, $23, $17
+ addq $24, $28, $24
+ addq $25, $17, $25
+ mulq $8, $22, $18
+ cmpult $24, $28, $4
+ addq $25, $4, $25
+ stq $24, 40($16)
+ addq $25, $27, $24
+ bis $31, $31, $25
+ addq $24, $21, $24
+ bis $31, $31, $27
+ umulh $8, $22, $7
+ cmpult $24, $21, $20
+ addq $24, $19, $24
+ addq $25, $20, $25
+ cmpult $24, $19, $23
+ addq $24, $18, $24
+ addq $25, $23, $25
+ cmpult $24, $18, $17
+ addq $25, $17, $25
+ stq $24, 48($16)
+ addq $25, $27, $24
+ addq $24, $7, $24
+ stq $24, 56($16)
+ ret $31,($26),1
+ .end bn_mul_comba4
+ .text
+ .align 3
+ .globl bn_sqr_comba4
+ .ent bn_sqr_comba4
+bn_sqr_comba4:
+bn_sqr_comba4..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
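+	# Cross products are doubled by adding each half to itself; since
+	# that addq loses the top bit, cmplt against $31 (a sign test of
+	# bit 63) captures the lost bit first so it can be carried upward.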
+ ldq $0, 0($17)
+ ldq $1, 8($17)
+ ldq $2, 16($17)
+ ldq $3, 24($17)
+ bis $31, $31, $6
+ mulq $0, $0, $4
+ umulh $0, $0, $5
+ stq $4, 0($16)
+ bis $31, $31, $4
+ mulq $0, $1, $7
+ umulh $0, $1, $8
+ cmplt $7, $31, $22
+ cmplt $8, $31, $23
+ addq $7, $7, $7
+ addq $8, $8, $8
+ addq $8, $22, $8
+ addq $4, $23, $4
+ addq $5, $7, $5
+ addq $6, $8, $6
+ cmpult $5, $7, $24
+ cmpult $6, $8, $25
+ addq $6, $24, $6
+ addq $4, $25, $4
+ stq $5, 8($16)
+ bis $31, $31, $5
+ mulq $1, $1, $27
+ umulh $1, $1, $28
+ addq $6, $27, $6
+ addq $4, $28, $4
+ cmpult $6, $27, $21
+ cmpult $4, $28, $20
+ addq $4, $21, $4
+ addq $5, $20, $5
+ mulq $2, $0, $19
+ umulh $2, $0, $18
+ cmplt $19, $31, $17
+ cmplt $18, $31, $22
+ addq $19, $19, $19
+ addq $18, $18, $18
+ addq $18, $17, $18
+ addq $5, $22, $5
+ addq $6, $19, $6
+ addq $4, $18, $4
+ cmpult $6, $19, $23
+ cmpult $4, $18, $7
+ addq $4, $23, $4
+ addq $5, $7, $5
+ stq $6, 16($16)
+ bis $31, $31, $6
+ mulq $3, $0, $8
+ umulh $3, $0, $24
+ cmplt $8, $31, $25
+ cmplt $24, $31, $27
+ addq $8, $8, $8
+ addq $24, $24, $24
+ addq $24, $25, $24
+ addq $6, $27, $6
+ addq $4, $8, $4
+ addq $5, $24, $5
+ cmpult $4, $8, $28
+ cmpult $5, $24, $21
+ addq $5, $28, $5
+ addq $6, $21, $6
+ mulq $2, $1, $20
+ umulh $2, $1, $17
+ cmplt $20, $31, $22
+ cmplt $17, $31, $19
+ addq $20, $20, $20
+ addq $17, $17, $17
+ addq $17, $22, $17
+ addq $6, $19, $6
+ addq $4, $20, $4
+ addq $5, $17, $5
+ cmpult $4, $20, $18
+ cmpult $5, $17, $23
+ addq $5, $18, $5
+ addq $6, $23, $6
+ stq $4, 24($16)
+ bis $31, $31, $4
+ mulq $2, $2, $7
+ umulh $2, $2, $25
+ addq $5, $7, $5
+ addq $6, $25, $6
+ cmpult $5, $7, $27
+ cmpult $6, $25, $8
+ addq $6, $27, $6
+ addq $4, $8, $4
+ mulq $3, $1, $24
+ umulh $3, $1, $28
+ cmplt $24, $31, $21
+ cmplt $28, $31, $22
+ addq $24, $24, $24
+ addq $28, $28, $28
+ addq $28, $21, $28
+ addq $4, $22, $4
+ addq $5, $24, $5
+ addq $6, $28, $6
+ cmpult $5, $24, $19
+ cmpult $6, $28, $20
+ addq $6, $19, $6
+ addq $4, $20, $4
+ stq $5, 32($16)
+ bis $31, $31, $5
+ mulq $3, $2, $17
+ umulh $3, $2, $18
+ cmplt $17, $31, $23
+ cmplt $18, $31, $7
+ addq $17, $17, $17
+ addq $18, $18, $18
+ addq $18, $23, $18
+ addq $5, $7, $5
+ addq $6, $17, $6
+ addq $4, $18, $4
+ cmpult $6, $17, $25
+ cmpult $4, $18, $27
+ addq $4, $25, $4
+ addq $5, $27, $5
+ stq $6, 40($16)
+ bis $31, $31, $6
+ mulq $3, $3, $8
+ umulh $3, $3, $21
+ addq $4, $8, $4
+ addq $5, $21, $5
+ cmpult $4, $8, $22
+ cmpult $5, $21, $24
+ addq $5, $22, $5
+ addq $6, $24, $6
+ stq $4, 48($16)
+ stq $5, 56($16)
+ ret $31,($26),1
+ .end bn_sqr_comba4
+ .text
+ .align 3
+ .globl bn_sqr_comba8
+ .ent bn_sqr_comba8
+bn_sqr_comba8:
+bn_sqr_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 8($17)
+ ldq $2, 16($17)
+ ldq $3, 24($17)
+ ldq $4, 32($17)
+ ldq $5, 40($17)
+ ldq $6, 48($17)
+ ldq $7, 56($17)
+ bis $31, $31, $23
+ mulq $0, $0, $8
+ umulh $0, $0, $22
+ stq $8, 0($16)
+ bis $31, $31, $8
+ mulq $1, $0, $24
+ umulh $1, $0, $25
+ cmplt $24, $31, $27
+ cmplt $25, $31, $28
+ addq $24, $24, $24
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $8, $28, $8
+ addq $22, $24, $22
+ addq $23, $25, $23
+ cmpult $22, $24, $21
+ cmpult $23, $25, $20
+ addq $23, $21, $23
+ addq $8, $20, $8
+ stq $22, 8($16)
+ bis $31, $31, $22
+ mulq $1, $1, $19
+ umulh $1, $1, $18
+ addq $23, $19, $23
+ addq $8, $18, $8
+ cmpult $23, $19, $17
+ cmpult $8, $18, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ mulq $2, $0, $28
+ umulh $2, $0, $24
+ cmplt $28, $31, $25
+ cmplt $24, $31, $21
+ addq $28, $28, $28
+ addq $24, $24, $24
+ addq $24, $25, $24
+ addq $22, $21, $22
+ addq $23, $28, $23
+ addq $8, $24, $8
+ cmpult $23, $28, $20
+ cmpult $8, $24, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 16($16)
+ bis $31, $31, $23
+ mulq $2, $1, $18
+ umulh $2, $1, $17
+ cmplt $18, $31, $27
+ cmplt $17, $31, $25
+ addq $18, $18, $18
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $23, $25, $23
+ addq $8, $18, $8
+ addq $22, $17, $22
+ cmpult $8, $18, $21
+ cmpult $22, $17, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $3, $0, $24
+ umulh $3, $0, $20
+ cmplt $24, $31, $19
+ cmplt $20, $31, $27
+ addq $24, $24, $24
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $23, $27, $23
+ addq $8, $24, $8
+ addq $22, $20, $22
+ cmpult $8, $24, $25
+ cmpult $22, $20, $18
+ addq $22, $25, $22
+ addq $23, $18, $23
+ stq $8, 24($16)
+ bis $31, $31, $8
+ mulq $2, $2, $17
+ umulh $2, $2, $21
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $28
+ cmpult $23, $21, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ mulq $3, $1, $27
+ umulh $3, $1, $24
+ cmplt $27, $31, $20
+ cmplt $24, $31, $25
+ addq $27, $27, $27
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $8, $25, $8
+ addq $22, $27, $22
+ addq $23, $24, $23
+ cmpult $22, $27, $18
+ cmpult $23, $24, $17
+ addq $23, $18, $23
+ addq $8, $17, $8
+ mulq $4, $0, $21
+ umulh $4, $0, $28
+ cmplt $21, $31, $19
+ cmplt $28, $31, $20
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $19, $28
+ addq $8, $20, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $25
+ cmpult $23, $28, $27
+ addq $23, $25, $23
+ addq $8, $27, $8
+ stq $22, 32($16)
+ bis $31, $31, $22
+ mulq $3, $2, $24
+ umulh $3, $2, $18
+ cmplt $24, $31, $17
+ cmplt $18, $31, $19
+ addq $24, $24, $24
+ addq $18, $18, $18
+ addq $18, $17, $18
+ addq $22, $19, $22
+ addq $23, $24, $23
+ addq $8, $18, $8
+ cmpult $23, $24, $20
+ cmpult $8, $18, $21
+ addq $8, $20, $8
+ addq $22, $21, $22
+ mulq $4, $1, $28
+ umulh $4, $1, $25
+ cmplt $28, $31, $27
+ cmplt $25, $31, $17
+ addq $28, $28, $28
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $22, $17, $22
+ addq $23, $28, $23
+ addq $8, $25, $8
+ cmpult $23, $28, $19
+ cmpult $8, $25, $24
+ addq $8, $19, $8
+ addq $22, $24, $22
+ mulq $5, $0, $18
+ umulh $5, $0, $20
+ cmplt $18, $31, $21
+ cmplt $20, $31, $27
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $21, $20
+ addq $22, $27, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $28
+ addq $8, $17, $8
+ addq $22, $28, $22
+ stq $23, 40($16)
+ bis $31, $31, $23
+ mulq $3, $3, $25
+ umulh $3, $3, $19
+ addq $8, $25, $8
+ addq $22, $19, $22
+ cmpult $8, $25, $24
+ cmpult $22, $19, $21
+ addq $22, $24, $22
+ addq $23, $21, $23
+ mulq $4, $2, $27
+ umulh $4, $2, $18
+ cmplt $27, $31, $20
+ cmplt $18, $31, $17
+ addq $27, $27, $27
+ addq $18, $18, $18
+ addq $18, $20, $18
+ addq $23, $17, $23
+ addq $8, $27, $8
+ addq $22, $18, $22
+ cmpult $8, $27, $28
+ cmpult $22, $18, $25
+ addq $22, $28, $22
+ addq $23, $25, $23
+ mulq $5, $1, $19
+ umulh $5, $1, $24
+ cmplt $19, $31, $21
+ cmplt $24, $31, $20
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $21, $24
+ addq $23, $20, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $17
+ cmpult $22, $24, $27
+ addq $22, $17, $22
+ addq $23, $27, $23
+ mulq $6, $0, $18
+ umulh $6, $0, $28
+ cmplt $18, $31, $25
+ cmplt $28, $31, $21
+ addq $18, $18, $18
+ addq $28, $28, $28
+ addq $28, $25, $28
+ addq $23, $21, $23
+ addq $8, $18, $8
+ addq $22, $28, $22
+ cmpult $8, $18, $20
+ cmpult $22, $28, $19
+ addq $22, $20, $22
+ addq $23, $19, $23
+ stq $8, 48($16)
+ bis $31, $31, $8
+ mulq $4, $3, $24
+ umulh $4, $3, $17
+ cmplt $24, $31, $27
+ cmplt $17, $31, $25
+ addq $24, $24, $24
+ addq $17, $17, $17
+ addq $17, $27, $17
+ addq $8, $25, $8
+ addq $22, $24, $22
+ addq $23, $17, $23
+ cmpult $22, $24, $21
+ cmpult $23, $17, $18
+ addq $23, $21, $23
+ addq $8, $18, $8
+ mulq $5, $2, $28
+ umulh $5, $2, $20
+ cmplt $28, $31, $19
+ cmplt $20, $31, $27
+ addq $28, $28, $28
+ addq $20, $20, $20
+ addq $20, $19, $20
+ addq $8, $27, $8
+ addq $22, $28, $22
+ addq $23, $20, $23
+ cmpult $22, $28, $25
+ cmpult $23, $20, $24
+ addq $23, $25, $23
+ addq $8, $24, $8
+ mulq $6, $1, $17
+ umulh $6, $1, $21
+ cmplt $17, $31, $18
+ cmplt $21, $31, $19
+ addq $17, $17, $17
+ addq $21, $21, $21
+ addq $21, $18, $21
+ addq $8, $19, $8
+ addq $22, $17, $22
+ addq $23, $21, $23
+ cmpult $22, $17, $27
+ cmpult $23, $21, $28
+ addq $23, $27, $23
+ addq $8, $28, $8
+ mulq $7, $0, $20
+ umulh $7, $0, $25
+ cmplt $20, $31, $24
+ cmplt $25, $31, $18
+ addq $20, $20, $20
+ addq $25, $25, $25
+ addq $25, $24, $25
+ addq $8, $18, $8
+ addq $22, $20, $22
+ addq $23, $25, $23
+ cmpult $22, $20, $19
+ cmpult $23, $25, $17
+ addq $23, $19, $23
+ addq $8, $17, $8
+ stq $22, 56($16)
+ bis $31, $31, $22
+ mulq $4, $4, $21
+ umulh $4, $4, $27
+ addq $23, $21, $23
+ addq $8, $27, $8
+ cmpult $23, $21, $28
+ cmpult $8, $27, $24
+ addq $8, $28, $8
+ addq $22, $24, $22
+ mulq $5, $3, $18
+ umulh $5, $3, $20
+ cmplt $18, $31, $25
+ cmplt $20, $31, $19
+ addq $18, $18, $18
+ addq $20, $20, $20
+ addq $20, $25, $20
+ addq $22, $19, $22
+ addq $23, $18, $23
+ addq $8, $20, $8
+ cmpult $23, $18, $17
+ cmpult $8, $20, $21
+ addq $8, $17, $8
+ addq $22, $21, $22
+ mulq $6, $2, $27
+ umulh $6, $2, $28
+ cmplt $27, $31, $24
+ cmplt $28, $31, $25
+ addq $27, $27, $27
+ addq $28, $28, $28
+ addq $28, $24, $28
+ addq $22, $25, $22
+ addq $23, $27, $23
+ addq $8, $28, $8
+ cmpult $23, $27, $19
+ cmpult $8, $28, $18
+ addq $8, $19, $8
+ addq $22, $18, $22
+ mulq $7, $1, $20
+ umulh $7, $1, $17
+ cmplt $20, $31, $21
+ cmplt $17, $31, $24
+ addq $20, $20, $20
+ addq $17, $17, $17
+ addq $17, $21, $17
+ addq $22, $24, $22
+ addq $23, $20, $23
+ addq $8, $17, $8
+ cmpult $23, $20, $25
+ cmpult $8, $17, $27
+ addq $8, $25, $8
+ addq $22, $27, $22
+ stq $23, 64($16)
+ bis $31, $31, $23
+ mulq $5, $4, $28
+ umulh $5, $4, $19
+ cmplt $28, $31, $18
+ cmplt $19, $31, $21
+ addq $28, $28, $28
+ addq $19, $19, $19
+ addq $19, $18, $19
+ addq $23, $21, $23
+ addq $8, $28, $8
+ addq $22, $19, $22
+ cmpult $8, $28, $24
+ cmpult $22, $19, $20
+ addq $22, $24, $22
+ addq $23, $20, $23
+ mulq $6, $3, $17
+ umulh $6, $3, $25
+ cmplt $17, $31, $27
+ cmplt $25, $31, $18
+ addq $17, $17, $17
+ addq $25, $25, $25
+ addq $25, $27, $25
+ addq $23, $18, $23
+ addq $8, $17, $8
+ addq $22, $25, $22
+ cmpult $8, $17, $21
+ cmpult $22, $25, $28
+ addq $22, $21, $22
+ addq $23, $28, $23
+ mulq $7, $2, $19
+ umulh $7, $2, $24
+ cmplt $19, $31, $20
+ cmplt $24, $31, $27
+ addq $19, $19, $19
+ addq $24, $24, $24
+ addq $24, $20, $24
+ addq $23, $27, $23
+ addq $8, $19, $8
+ addq $22, $24, $22
+ cmpult $8, $19, $18
+ cmpult $22, $24, $17
+ addq $22, $18, $22
+ addq $23, $17, $23
+ stq $8, 72($16)
+ bis $31, $31, $8
+ mulq $5, $5, $25
+ umulh $5, $5, $21
+ addq $22, $25, $22
+ addq $23, $21, $23
+ cmpult $22, $25, $28
+ cmpult $23, $21, $20
+ addq $23, $28, $23
+ addq $8, $20, $8
+ mulq $6, $4, $27
+ umulh $6, $4, $19
+ cmplt $27, $31, $24
+ cmplt $19, $31, $18
+ addq $27, $27, $27
+ addq $19, $19, $19
+ addq $19, $24, $19
+ addq $8, $18, $8
+ addq $22, $27, $22
+ addq $23, $19, $23
+ cmpult $22, $27, $17
+ cmpult $23, $19, $25
+ addq $23, $17, $23
+ addq $8, $25, $8
+ mulq $7, $3, $21
+ umulh $7, $3, $28
+ cmplt $21, $31, $20
+ cmplt $28, $31, $24
+ addq $21, $21, $21
+ addq $28, $28, $28
+ addq $28, $20, $28
+ addq $8, $24, $8
+ addq $22, $21, $22
+ addq $23, $28, $23
+ cmpult $22, $21, $18
+ cmpult $23, $28, $27
+ addq $23, $18, $23
+ addq $8, $27, $8
+ stq $22, 80($16)
+ bis $31, $31, $22
+ mulq $6, $5, $19
+ umulh $6, $5, $17
+ cmplt $19, $31, $25
+ cmplt $17, $31, $20
+ addq $19, $19, $19
+ addq $17, $17, $17
+ addq $17, $25, $17
+ addq $22, $20, $22
+ addq $23, $19, $23
+ addq $8, $17, $8
+ cmpult $23, $19, $24
+ cmpult $8, $17, $21
+ addq $8, $24, $8
+ addq $22, $21, $22
+ mulq $7, $4, $28
+ umulh $7, $4, $18
+ cmplt $28, $31, $27
+ cmplt $18, $31, $25
+ addq $28, $28, $28
+ addq $18, $18, $18
+ addq $18, $27, $18
+ addq $22, $25, $22
+ addq $23, $28, $23
+ addq $8, $18, $8
+ cmpult $23, $28, $20
+ cmpult $8, $18, $19
+ addq $8, $20, $8
+ addq $22, $19, $22
+ stq $23, 88($16)
+ bis $31, $31, $23
+ mulq $6, $6, $17
+ umulh $6, $6, $24
+ addq $8, $17, $8
+ addq $22, $24, $22
+ cmpult $8, $17, $21
+ cmpult $22, $24, $27
+ addq $22, $21, $22
+ addq $23, $27, $23
+ mulq $7, $5, $25
+ umulh $7, $5, $28
+ cmplt $25, $31, $18
+ cmplt $28, $31, $20
+ addq $25, $25, $25
+ addq $28, $28, $28
+ addq $28, $18, $28
+ addq $23, $20, $23
+ addq $8, $25, $8
+ addq $22, $28, $22
+ cmpult $8, $25, $19
+ cmpult $22, $28, $17
+ addq $22, $19, $22
+ addq $23, $17, $23
+ stq $8, 96($16)
+ bis $31, $31, $8
+ mulq $7, $6, $24
+ umulh $7, $6, $21
+ cmplt $24, $31, $27
+ cmplt $21, $31, $18
+ addq $24, $24, $24
+ addq $21, $21, $21
+ addq $21, $27, $21
+ addq $8, $18, $8
+ addq $22, $24, $22
+ addq $23, $21, $23
+ cmpult $22, $24, $20
+ cmpult $23, $21, $25
+ addq $23, $20, $23
+ addq $8, $25, $8
+ stq $22, 104($16)
+ bis $31, $31, $22
+ mulq $7, $7, $28
+ umulh $7, $7, $19
+ addq $23, $28, $23
+ addq $8, $19, $8
+ cmpult $23, $28, $17
+ cmpult $8, $19, $27
+ addq $8, $17, $8
+ addq $22, $27, $22
+ stq $23, 112($16)
+ stq $8, 120($16)
+ ret $31,($26),1
+ .end bn_sqr_comba8
diff --git a/crypto/bn/asm/ff b/crypto/bn/asm/ff
new file mode 100644
index 0000000000..4af216889d
--- /dev/null
+++ b/crypto/bn/asm/ff
@@ -0,0 +1,724 @@
+ .text
+ .align 3
+ .globl bn_mul_comba4
+ .ent bn_mul_comba4
+bn_mul_comba4:
+bn_mul_comba4..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ ldq $4, 16($17)
+ ldq $5, 16($18)
+ ldq $6, 24($17)
+ ldq $7, 24($18)
+ bis $31, $31, $23
+ mulq $0, $1, $8
+ umulh $0, $1, $22
+ stq $8, 0($16)
+ bis $31, $31, $8
+ mulq $0, $3, $24
+ umulh $0, $3, $25
+ addq $22, $24, $22
+ cmpult $22, $24, $27
+ addq $27, $25, $25
+ addq $23, $25, $23
+ cmpult $23, $25, $28
+ addq $8, $28, $8
+ mulq $2, $1, $21
+ umulh $2, $1, $20
+ addq $22, $21, $22
+ cmpult $22, $21, $19
+ addq $19, $20, $20
+ addq $23, $20, $23
+ cmpult $23, $20, $17
+ addq $8, $17, $8
+ stq $22, 8($16)
+ bis $31, $31, $22
+ mulq $2, $3, $18
+ umulh $2, $3, $24
+ addq $23, $18, $23
+ cmpult $23, $18, $27
+ addq $27, $24, $24
+ addq $8, $24, $8
+ cmpult $8, $24, $25
+ addq $22, $25, $22
+ mulq $0, $5, $28
+ umulh $0, $5, $21
+ addq $23, $28, $23
+ cmpult $23, $28, $19
+ addq $19, $21, $21
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $22, $20, $22
+ mulq $4, $1, $17
+ umulh $4, $1, $18
+ addq $23, $17, $23
+ cmpult $23, $17, $27
+ addq $27, $18, $18
+ addq $8, $18, $8
+ cmpult $8, $18, $24
+ addq $22, $24, $22
+ stq $23, 16($16)
+ bis $31, $31, $23
+ mulq $0, $7, $25
+ umulh $0, $7, $28
+ addq $8, $25, $8
+ cmpult $8, $25, $19
+ addq $19, $28, $28
+ addq $22, $28, $22
+ cmpult $22, $28, $21
+ addq $23, $21, $23
+ mulq $2, $5, $20
+ umulh $2, $5, $17
+ addq $8, $20, $8
+ cmpult $8, $20, $27
+ addq $27, $17, $17
+ addq $22, $17, $22
+ cmpult $22, $17, $18
+ addq $23, $18, $23
+ mulq $4, $3, $24
+ umulh $4, $3, $25
+ addq $8, $24, $8
+ cmpult $8, $24, $19
+ addq $19, $25, $25
+ addq $22, $25, $22
+ cmpult $22, $25, $28
+ addq $23, $28, $23
+ mulq $6, $1, $21
+ umulh $6, $1, $0
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $20, $0, $0
+ addq $22, $0, $22
+ cmpult $22, $0, $27
+ addq $23, $27, $23
+ stq $8, 24($16)
+ bis $31, $31, $8
+ mulq $2, $7, $17
+ umulh $2, $7, $18
+ addq $22, $17, $22
+ cmpult $22, $17, $24
+ addq $24, $18, $18
+ addq $23, $18, $23
+ cmpult $23, $18, $19
+ addq $8, $19, $8
+ mulq $4, $5, $25
+ umulh $4, $5, $28
+ addq $22, $25, $22
+ cmpult $22, $25, $21
+ addq $21, $28, $28
+ addq $23, $28, $23
+ cmpult $23, $28, $20
+ addq $8, $20, $8
+ mulq $6, $3, $0
+ umulh $6, $3, $27
+ addq $22, $0, $22
+ cmpult $22, $0, $1
+ addq $1, $27, $27
+ addq $23, $27, $23
+ cmpult $23, $27, $17
+ addq $8, $17, $8
+ stq $22, 32($16)
+ bis $31, $31, $22
+ mulq $4, $7, $24
+ umulh $4, $7, $18
+ addq $23, $24, $23
+ cmpult $23, $24, $19
+ addq $19, $18, $18
+ addq $8, $18, $8
+ cmpult $8, $18, $2
+ addq $22, $2, $22
+ mulq $6, $5, $25
+ umulh $6, $5, $21
+ addq $23, $25, $23
+ cmpult $23, $25, $28
+ addq $28, $21, $21
+ addq $8, $21, $8
+ cmpult $8, $21, $20
+ addq $22, $20, $22
+ stq $23, 40($16)
+ bis $31, $31, $23
+ mulq $6, $7, $0
+ umulh $6, $7, $1
+ addq $8, $0, $8
+ cmpult $8, $0, $27
+ addq $27, $1, $1
+ addq $22, $1, $22
+ cmpult $22, $1, $17
+ addq $23, $17, $23
+ stq $8, 48($16)
+ stq $22, 56($16)
+ ret $31,($26),1
+ .end bn_mul_comba4
+ .text
+ .align 3
+ .globl bn_mul_comba8
+ .ent bn_mul_comba8
+bn_mul_comba8:
+bn_mul_comba8..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ stq $9, 8($30)
+ stq $10, 16($30)
+ ldq $0, 0($17)
+ ldq $1, 0($18)
+ ldq $2, 8($17)
+ ldq $3, 8($18)
+ ldq $4, 16($17)
+ ldq $5, 16($18)
+ ldq $6, 24($17)
+ ldq $7, 24($18)
+	ldq	$8, 32($17)
+	ldq	$22, 32($18)
+	ldq	$23, 40($17)
+	ldq	$24, 40($18)
+	ldq	$25, 48($17)
+	ldq	$27, 48($18)
+	ldq	$28, 56($17)
+	ldq	$21, 56($18)
+ bis $31, $31, $9
+ mulq $0, $1, $20
+ umulh $0, $1, $19
+ stq $20, 0($16)
+ bis $31, $31, $20
+ mulq $0, $3, $10
+ umulh $0, $3, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $2, $1, $18
+ umulh $2, $1, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ stq $19, 8($16)
+ bis $31, $31, $19
+ mulq $0, $5, $10
+ umulh $0, $5, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $2, $3, $18
+ umulh $2, $3, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $4, $1, $10
+ umulh $4, $1, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ stq $9, 16($16)
+ bis $31, $31, $9
+ mulq $0, $7, $18
+ umulh $0, $7, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $2, $5, $10
+ umulh $2, $5, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $4, $3, $18
+ umulh $4, $3, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $6, $1, $10
+ umulh $6, $1, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ stq $20, 24($16)
+ bis $31, $31, $20
+ mulq $0, $22, $18
+ umulh $0, $22, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $2, $7, $10
+ umulh $2, $7, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $4, $5, $18
+ umulh $4, $5, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $6, $3, $10
+ umulh $6, $3, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $8, $1, $18
+ umulh $8, $1, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ stq $19, 32($16)
+ bis $31, $31, $19
+ mulq $0, $24, $10
+ umulh $0, $24, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $2, $22, $18
+ umulh $2, $22, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $4, $7, $10
+ umulh $4, $7, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $6, $5, $18
+ umulh $6, $5, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ mulq $8, $3, $10
+ umulh $8, $3, $17
+ addq $9, $10, $9
+ cmpult $9, $10, $18
+ addq $18, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ mulq $23, $1, $18
+ umulh $23, $1, $17
+ addq $9, $18, $9
+ cmpult $9, $18, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $19, $18, $19
+ stq $9, 40($16)
+ bis $31, $31, $9
+ mulq $0, $27, $10
+ umulh $0, $27, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $2, $24, $18
+ umulh $2, $24, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $4, $22, $10
+ umulh $4, $22, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $6, $7, $18
+ umulh $6, $7, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $8, $5, $10
+ umulh $8, $5, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ mulq $23, $3, $18
+ umulh $23, $3, $17
+ addq $20, $18, $20
+ cmpult $20, $18, $10
+ addq $10, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $9, $18, $9
+ mulq $25, $1, $10
+ umulh $25, $1, $17
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $18, $17, $17
+ addq $19, $17, $19
+ cmpult $19, $17, $10
+ addq $9, $10, $9
+ stq $20, 48($16)
+ bis $31, $31, $20
+ mulq $0, $21, $18
+ umulh $0, $21, $17
+ addq $19, $18, $19
+ cmpult $19, $18, $10
+ addq $10, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $20, $18, $20
+ mulq $2, $27, $10
+ umulh $2, $27, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $4, $24, $10
+ umulh $4, $24, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $6, $22, $10
+ umulh $6, $22, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $8, $7, $10
+ umulh $8, $7, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $23, $5, $10
+ umulh $23, $5, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ mulq $25, $3, $10
+ umulh $25, $3, $18
+ addq $19, $10, $19
+ cmpult $19, $10, $17
+ addq $17, $18, $18
+ addq $9, $18, $9
+ cmpult $9, $18, $0
+ addq $20, $0, $20
+ mulq $28, $1, $10
+ umulh $28, $1, $17
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $0
+ addq $20, $0, $20
+ stq $19, 56($16)
+ bis $31, $31, $19
+ mulq $2, $21, $10
+ umulh $2, $21, $18
+ addq $9, $10, $9
+ cmpult $9, $10, $17
+ addq $17, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $0
+ addq $19, $0, $19
+ mulq $4, $27, $1
+ umulh $4, $27, $10
+ addq $9, $1, $9
+ cmpult $9, $1, $17
+ addq $17, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $18
+ addq $19, $18, $19
+ mulq $6, $24, $0
+ umulh $6, $24, $2
+ addq $9, $0, $9
+ cmpult $9, $0, $1
+ addq $1, $2, $2
+ addq $20, $2, $20
+ cmpult $20, $2, $17
+ addq $19, $17, $19
+ mulq $8, $22, $10
+ umulh $8, $22, $18
+ addq $9, $10, $9
+ cmpult $9, $10, $0
+ addq $0, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $1
+ addq $19, $1, $19
+ mulq $23, $7, $2
+ umulh $23, $7, $17
+ addq $9, $2, $9
+ cmpult $9, $2, $10
+ addq $10, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $0
+ addq $19, $0, $19
+ mulq $25, $5, $18
+ umulh $25, $5, $1
+ addq $9, $18, $9
+ cmpult $9, $18, $2
+ addq $2, $1, $1
+ addq $20, $1, $20
+ cmpult $20, $1, $10
+ addq $19, $10, $19
+ mulq $28, $3, $17
+ umulh $28, $3, $0
+ addq $9, $17, $9
+ cmpult $9, $17, $18
+ addq $18, $0, $0
+ addq $20, $0, $20
+ cmpult $20, $0, $2
+ addq $19, $2, $19
+ stq $9, 64($16)
+ bis $31, $31, $9
+ mulq $4, $21, $1
+ umulh $4, $21, $10
+ addq $20, $1, $20
+ cmpult $20, $1, $17
+ addq $17, $10, $10
+ addq $19, $10, $19
+ cmpult $19, $10, $18
+ addq $9, $18, $9
+ mulq $6, $27, $0
+ umulh $6, $27, $2
+ addq $20, $0, $20
+ cmpult $20, $0, $3
+ addq $3, $2, $2
+ addq $19, $2, $19
+ cmpult $19, $2, $1
+ addq $9, $1, $9
+ mulq $8, $24, $17
+ umulh $8, $24, $10
+ addq $20, $17, $20
+ cmpult $20, $17, $18
+ addq $18, $10, $10
+ addq $19, $10, $19
+ cmpult $19, $10, $4
+ addq $9, $4, $9
+ mulq $23, $22, $0
+ umulh $23, $22, $3
+ addq $20, $0, $20
+ cmpult $20, $0, $2
+ addq $2, $3, $3
+ addq $19, $3, $19
+ cmpult $19, $3, $1
+ addq $9, $1, $9
+ mulq $25, $7, $17
+ umulh $25, $7, $18
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $10, $18, $18
+ addq $19, $18, $19
+ cmpult $19, $18, $4
+ addq $9, $4, $9
+ mulq $28, $5, $0
+ umulh $28, $5, $2
+ addq $20, $0, $20
+ cmpult $20, $0, $3
+ addq $3, $2, $2
+ addq $19, $2, $19
+ cmpult $19, $2, $1
+ addq $9, $1, $9
+ stq $20, 72($16)
+ bis $31, $31, $20
+ mulq $6, $21, $17
+ umulh $6, $21, $10
+ addq $19, $17, $19
+ cmpult $19, $17, $18
+ addq $18, $10, $10
+ addq $9, $10, $9
+ cmpult $9, $10, $4
+ addq $20, $4, $20
+ mulq $8, $27, $0
+ umulh $8, $27, $3
+ addq $19, $0, $19
+ cmpult $19, $0, $2
+ addq $2, $3, $3
+ addq $9, $3, $9
+ cmpult $9, $3, $1
+ addq $20, $1, $20
+ mulq $23, $24, $5
+ umulh $23, $24, $17
+ addq $19, $5, $19
+ cmpult $19, $5, $18
+ addq $18, $17, $17
+ addq $9, $17, $9
+ cmpult $9, $17, $10
+ addq $20, $10, $20
+ mulq $25, $22, $4
+ umulh $25, $22, $6
+ addq $19, $4, $19
+ cmpult $19, $4, $0
+ addq $0, $6, $6
+ addq $9, $6, $9
+ cmpult $9, $6, $2
+ addq $20, $2, $20
+ mulq $28, $7, $3
+ umulh $28, $7, $1
+ addq $19, $3, $19
+ cmpult $19, $3, $5
+ addq $5, $1, $1
+ addq $9, $1, $9
+ cmpult $9, $1, $18
+ addq $20, $18, $20
+ stq $19, 80($16)
+ bis $31, $31, $19
+ mulq $8, $21, $17
+ umulh $8, $21, $10
+ addq $9, $17, $9
+ cmpult $9, $17, $4
+ addq $4, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $0
+ addq $19, $0, $19
+ mulq $23, $27, $6
+ umulh $23, $27, $2
+ addq $9, $6, $9
+ cmpult $9, $6, $3
+ addq $3, $2, $2
+ addq $20, $2, $20
+ cmpult $20, $2, $5
+ addq $19, $5, $19
+ mulq $25, $24, $1
+ umulh $25, $24, $18
+ addq $9, $1, $9
+ cmpult $9, $1, $7
+ addq $7, $18, $18
+ addq $20, $18, $20
+ cmpult $20, $18, $17
+ addq $19, $17, $19
+ mulq $28, $22, $4
+ umulh $28, $22, $10
+ addq $9, $4, $9
+ cmpult $9, $4, $0
+ addq $0, $10, $10
+ addq $20, $10, $20
+ cmpult $20, $10, $8
+ addq $19, $8, $19
+ stq $9, 88($16)
+ bis $31, $31, $9
+ mulq $23, $21, $6
+ umulh $23, $21, $3
+ addq $20, $6, $20
+ cmpult $20, $6, $2
+ addq $2, $3, $3
+ addq $19, $3, $19
+ cmpult $19, $3, $5
+ addq $9, $5, $9
+ mulq $25, $27, $1
+ umulh $25, $27, $7
+ addq $20, $1, $20
+ cmpult $20, $1, $18
+ addq $18, $7, $7
+ addq $19, $7, $19
+ cmpult $19, $7, $17
+ addq $9, $17, $9
+ mulq $28, $24, $4
+ umulh $28, $24, $0
+ addq $20, $4, $20
+ cmpult $20, $4, $10
+ addq $10, $0, $0
+ addq $19, $0, $19
+ cmpult $19, $0, $8
+ addq $9, $8, $9
+ stq $20, 96($16)
+ bis $31, $31, $20
+ mulq $25, $21, $22
+ umulh $25, $21, $6
+ addq $19, $22, $19
+ cmpult $19, $22, $2
+ addq $2, $6, $6
+ addq $9, $6, $9
+ cmpult $9, $6, $3
+ addq $20, $3, $20
+ mulq $28, $27, $5
+ umulh $28, $27, $23
+ addq $19, $5, $19
+ cmpult $19, $5, $1
+ addq $1, $23, $23
+ addq $9, $23, $9
+ cmpult $9, $23, $18
+ addq $20, $18, $20
+ stq $19, 104($16)
+ bis $31, $31, $19
+ mulq $28, $21, $7
+ umulh $28, $21, $17
+ addq $9, $7, $9
+ cmpult $9, $7, $4
+ addq $4, $17, $17
+ addq $20, $17, $20
+ cmpult $20, $17, $10
+ addq $19, $10, $19
+ stq $9, 112($16)
+ stq $20, 120($16)
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ret $31,($26),1
+ .end bn_mul_comba8
diff --git a/crypto/bn/asm/mips1.s b/crypto/bn/asm/mips1.s
new file mode 100644
index 0000000000..44fa1254c7
--- /dev/null
+++ b/crypto/bn/asm/mips1.s
@@ -0,0 +1,539 @@
+/* This assembler code is for R2000/R3000 machines, or higher ones that do
+ * not want to do any 64 bit arithmetic.
+ * Make sure that the SSLeay bignum library is compiled with
+ * THIRTY_TWO_BIT set.
+ * This must either be compiled with the system CC, or, if you use GNU gas,
+ * cc -E mips1.s|gas -o mips1.o
+ */
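
For orientation, bn_mul_add_words computes r[i] = r[i] + a[i]*w with a running
carry and returns the final carry. A minimal C sketch of the same operation,
assuming BN_ULONG is the 32-bit limb of the THIRTY_TWO_BIT configuration and
using unsigned long long to stand in for the multiplier's HI/LO register pair:

    typedef unsigned int BN_ULONG;                /* 32-bit limb */

    BN_ULONG bn_mul_add_words(BN_ULONG *r, const BN_ULONG *a, int num, BN_ULONG w)
    {
        BN_ULONG c = 0;                           /* running carry (register CC below) */
        while (num-- > 0) {
            /* product + r[i] + carry never exceeds 64 bits */
            unsigned long long t = (unsigned long long)(*a++) * w + *r + c;
            *r++ = (BN_ULONG)t;                   /* low half, as from mflo */
            c = (BN_ULONG)(t >> 32);              /* high half, as from mfhi */
        }
        return c;
    }
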
+ .set reorder
+ .set noat
+
+#define R1 $1
+#define CC $2
+#define R2 $3
+#define R3 $8
+#define R4 $9
+#define L1 $10
+#define L2 $11
+#define L3 $12
+#define L4 $13
+#define H1 $14
+#define H2 $15
+#define H3 $24
+#define H4 $25
+
+#define P1 $4
+#define P2 $5
+#define P3 $6
+#define P4 $7
+
+ .align 2
+ .ent bn_mul_add_words
+ .globl bn_mul_add_words
+.text
+bn_mul_add_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ #blt P3,4,$lab34
+
+ subu R1,P3,4
+ move CC,$0
+ bltz R1,$lab34
+$lab2:
+ lw R1,0(P1)
+ lw L1,0(P2)
+ lw R2,4(P1)
+ lw L2,4(P2)
+ lw R3,8(P1)
+ lw L3,8(P2)
+ lw R4,12(P1)
+ lw L4,12(P2)
+ multu L1,P4
+ addu R1,R1,CC
+ mflo L1
+ sltu CC,R1,CC
+ addu R1,R1,L1
+ mfhi H1
+ sltu L1,R1,L1
+ sw R1,0(P1)
+ addu CC,CC,L1
+ multu L2,P4
+ addu CC,H1,CC
+ mflo L2
+ addu R2,R2,CC
+ sltu CC,R2,CC
+ mfhi H2
+ addu R2,R2,L2
+ addu P2,P2,16
+ sltu L2,R2,L2
+ sw R2,4(P1)
+ addu CC,CC,L2
+ multu L3,P4
+ addu CC,H2,CC
+ mflo L3
+ addu R3,R3,CC
+ sltu CC,R3,CC
+ mfhi H3
+ addu R3,R3,L3
+ addu P1,P1,16
+ sltu L3,R3,L3
+ sw R3,-8(P1)
+ addu CC,CC,L3
+ multu L4,P4
+ addu CC,H3,CC
+ mflo L4
+ addu R4,R4,CC
+ sltu CC,R4,CC
+ mfhi H4
+ addu R4,R4,L4
+ subu P3,P3,4
+ sltu L4,R4,L4
+ addu CC,CC,L4
+ addu CC,H4,CC
+
+ subu R1,P3,4
+ sw R4,-4(P1) # delay slot
+ bgez R1,$lab2
+
+ bleu P3,0,$lab3
+ .align 2
+$lab33:
+ lw L1,0(P2)
+ lw R1,0(P1)
+ multu L1,P4
+ addu R1,R1,CC
+ sltu CC,R1,CC
+ addu P1,P1,4
+ mflo L1
+ mfhi H1
+ addu R1,R1,L1
+ addu P2,P2,4
+ sltu L1,R1,L1
+ subu P3,P3,1
+ addu CC,CC,L1
+ sw R1,-4(P1)
+ addu CC,H1,CC
+ bgtz P3,$lab33
+ j $31
+ .align 2
+$lab3:
+ j $31
+ .align 2
+$lab34:
+ bgt P3,0,$lab33
+ j $31
+ .end bn_mul_add_words
+
+ .align 2
+ # Program Unit: bn_mul_words
+ .ent bn_mul_words
+ .globl bn_mul_words
+.text
+bn_mul_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P3,P3,4
+ move CC,$0
+ bltz P3,$lab45
+$lab44:
+ lw L1,0(P2)
+ lw L2,4(P2)
+ lw L3,8(P2)
+ lw L4,12(P2)
+ multu L1,P4
+ subu P3,P3,4
+ mflo L1
+ mfhi H1
+ addu L1,L1,CC
+ multu L2,P4
+ sltu CC,L1,CC
+ sw L1,0(P1)
+ addu CC,H1,CC
+ mflo L2
+ mfhi H2
+ addu L2,L2,CC
+ multu L3,P4
+ sltu CC,L2,CC
+ sw L2,4(P1)
+ addu CC,H2,CC
+ mflo L3
+ mfhi H3
+ addu L3,L3,CC
+ multu L4,P4
+ sltu CC,L3,CC
+ sw L3,8(P1)
+ addu CC,H3,CC
+ mflo L4
+ mfhi H4
+ addu L4,L4,CC
+ addu P1,P1,16
+ sltu CC,L4,CC
+ addu P2,P2,16
+ addu CC,H4,CC
+ sw L4,-4(P1)
+
+ bgez P3,$lab44
+ b $lab45
+$lab46:
+ lw L1,0(P2)
+ addu P1,P1,4
+ multu L1,P4
+ addu P2,P2,4
+ mflo L1
+ mfhi H1
+ addu L1,L1,CC
+ subu P3,P3,1
+ sltu CC,L1,CC
+ sw L1,-4(P1)
+ addu CC,H1,CC
+ bgtz P3,$lab46
+ j $31
+$lab45:
+ addu P3,P3,4
+ bgtz P3,$lab46
+ j $31
+ .align 2
+ .end bn_mul_words
+
+ # Program Unit: bn_sqr_words
+ .ent bn_sqr_words
+ .globl bn_sqr_words
+.text
+bn_sqr_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P3,P3,4
+ bltz P3,$lab55
+$lab54:
+ lw L1,0(P2)
+ lw L2,4(P2)
+ lw L3,8(P2)
+ lw L4,12(P2)
+
+ multu L1,L1
+ subu P3,P3,4
+ mflo L1
+ mfhi H1
+ sw L1,0(P1)
+ sw H1,4(P1)
+
+ multu L2,L2
+ addu P1,P1,32
+ mflo L2
+ mfhi H2
+ sw L2,-24(P1)
+ sw H2,-20(P1)
+
+ multu L3,L3
+ addu P2,P2,16
+ mflo L3
+ mfhi H3
+ sw L3,-16(P1)
+ sw H3,-12(P1)
+
+ multu L4,L4
+
+ mflo L4
+ mfhi H4
+ sw L4,-8(P1)
+ sw H4,-4(P1)
+
+ bgtz P3,$lab54
+ b $lab55
+$lab56:
+ lw L1,0(P2)
+ addu P1,P1,8
+ multu L1,L1
+ addu P2,P2,4
+ subu P3,P3,1
+ mflo L1
+ mfhi H1
+ sw L1,-8(P1)
+ sw H1,-4(P1)
+
+ bgtz P3,$lab56
+ j $31
+$lab55:
+ addu P3,P3,4
+ bgtz P3,$lab56
+ j $31
+ .align 2
+ .end bn_sqr_words
+
+ # Program Unit: bn_add_words
+ .ent bn_add_words
+ .globl bn_add_words
+.text
+bn_add_words: # 0x590
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P4,P4,4
+ move CC,$0
+ bltz P4,$lab65
+$lab64:
+ lw L1,0(P2)
+ lw R1,0(P3)
+ lw L2,4(P2)
+ lw R2,4(P3)
+
+ addu L1,L1,CC
+ lw L3,8(P2)
+ sltu CC,L1,CC
+ addu L1,L1,R1
+ sltu R1,L1,R1
+ lw R3,8(P3)
+ addu CC,CC,R1
+ lw L4,12(P2)
+
+ addu L2,L2,CC
+ lw R4,12(P3)
+ sltu CC,L2,CC
+ addu L2,L2,R2
+ sltu R2,L2,R2
+ sw L1,0(P1)
+ addu CC,CC,R2
+ addu P1,P1,16
+ addu L3,L3,CC
+ sw L2,-12(P1)
+
+ sltu CC,L3,CC
+ addu L3,L3,R3
+ sltu R3,L3,R3
+ addu P2,P2,16
+ addu CC,CC,R3
+
+ addu L4,L4,CC
+ addu P3,P3,16
+ sltu CC,L4,CC
+ addu L4,L4,R4
+ subu P4,P4,4
+ sltu R4,L4,R4
+ sw L3,-8(P1)
+ addu CC,CC,R4
+ sw L4,-4(P1)
+
+ bgtz P4,$lab64
+ b $lab65
+$lab66:
+ lw L1,0(P2)
+ lw R1,0(P3)
+ addu L1,L1,CC
+ addu P1,P1,4
+ sltu CC,L1,CC
+ addu P2,P2,4
+ addu P3,P3,4
+ addu L1,L1,R1
+ subu P4,P4,1
+ sltu R1,L1,R1
+ sw L1,-4(P1)
+ addu CC,CC,R1
+
+ bgtz P4,$lab66
+ j $31
+$lab65:
+ addu P4,P4,4
+ bgtz P4,$lab66
+ j $31
+ .end bn_add_words
+
+ # Program Unit: bn_div64
+ .set at
+ .set reorder
+ .text
+ .align 2
+ .globl bn_div64
+ # 321 {
+ .ent bn_div64 2
+bn_div64:
+ subu $sp, 64
+ sw $31, 56($sp)
+ sw $16, 48($sp)
+ .mask 0x80010000, -56
+ .frame $sp, 64, $31
+ move $9, $4
+ move $12, $5
+ move $16, $6
+ # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t;
+ move $31, $0
+ # 323 int i,count=2;
+ li $13, 2
+ # 324
+ # 325 if (d == 0) return(BN_MASK2);
+ bne $16, 0, $80
+ li $2, -1
+ b $93
+$80:
+ # 326
+ # 327 i=BN_num_bits_word(d);
+ move $4, $16
+ sw $31, 16($sp)
+ sw $9, 24($sp)
+ sw $12, 32($sp)
+ sw $13, 40($sp)
+ .livereg 0x800ff0e,0xfff
+ jal BN_num_bits_word
+ li $4, 32
+ lw $31, 16($sp)
+ lw $9, 24($sp)
+ lw $12, 32($sp)
+ lw $13, 40($sp)
+ move $3, $2
+ # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
+ beq $2, $4, $81
+ li $14, 1
+ sll $15, $14, $2
+ bleu $9, $15, $81
+ # 329 {
+ # 330 #if !defined(NO_STDIO) && !defined(WIN16)
+ # 331 fprintf(stderr,"Division would overflow (%d)\n",i);
+ # 332 #endif
+ # 333 abort();
+ sw $3, 8($sp)
+ sw $9, 24($sp)
+ sw $12, 32($sp)
+ sw $13, 40($sp)
+	sw	$31, 16($sp)
+ .livereg 0xff0e,0xfff
+ jal abort
+ lw $3, 8($sp)
+ li $4, 32
+ lw $9, 24($sp)
+ lw $12, 32($sp)
+ lw $13, 40($sp)
+	lw	$31, 16($sp)
+ # 334 }
+$81:
+ # 335 i=BN_BITS2-i;
+ subu $3, $4, $3
+ # 336 if (h >= d) h-=d;
+ bltu $9, $16, $82
+ subu $9, $9, $16
+$82:
+ # 337
+ # 338 if (i)
+ beq $3, 0, $83
+ # 339 {
+ # 340 d<<=i;
+ sll $16, $16, $3
+ # 341 h=(h<<i)|(l>>(BN_BITS2-i));
+ sll $24, $9, $3
+ subu $25, $4, $3
+ srl $14, $12, $25
+ or $9, $24, $14
+ # 342 l<<=i;
+ sll $12, $12, $3
+ # 343 }
+$83:
+ # 344 dh=(d&BN_MASK2h)>>BN_BITS4;
+ # 345 dl=(d&BN_MASK2l);
+ and $8, $16, -65536
+ srl $8, $8, 16
+ and $10, $16, 65535
+ li $6, -65536
+$84:
+ # 346 for (;;)
+ # 347 {
+ # 348 if ((h>>BN_BITS4) == dh)
+ srl $15, $9, 16
+ bne $8, $15, $85
+ # 349 q=BN_MASK2l;
+ li $5, 65535
+ b $86
+$85:
+ # 350 else
+ # 351 q=h/dh;
+ divu $5, $9, $8
+$86:
+ # 352
+ # 353 for (;;)
+ # 354 {
+ # 355 t=(h-q*dh);
+ mul $4, $5, $8
+ subu $2, $9, $4
+ move $3, $2
+ # 356 if ((t&BN_MASK2h) ||
+ # 357 ((dl*q) <= (
+ # 358 (t<<BN_BITS4)+
+ # 359 ((l&BN_MASK2h)>>BN_BITS4))))
+ and $25, $2, $6
+ bne $25, $0, $87
+ mul $24, $10, $5
+ sll $14, $3, 16
+ and $15, $12, $6
+ srl $25, $15, 16
+ addu $15, $14, $25
+ bgtu $24, $15, $88
+$87:
+ # 360 break;
+ mul $3, $10, $5
+ b $89
+$88:
+ # 361 q--;
+ addu $5, $5, -1
+ # 362 }
+ b $86
+$89:
+ # 363 th=q*dh;
+ # 364 tl=q*dl;
+ # 365 t=(tl>>BN_BITS4);
+ # 366 tl=(tl<<BN_BITS4)&BN_MASK2h;
+ sll $14, $3, 16
+ and $2, $14, $6
+ move $11, $2
+ # 367 th+=t;
+ srl $25, $3, 16
+ addu $7, $4, $25
+ # 368
+ # 369 if (l < tl) th++;
+ bgeu $12, $2, $90
+ addu $7, $7, 1
+$90:
+ # 370 l-=tl;
+ subu $12, $12, $11
+ # 371 if (h < th)
+ bgeu $9, $7, $91
+ # 372 {
+ # 373 h+=d;
+ addu $9, $9, $16
+ # 374 q--;
+ addu $5, $5, -1
+ # 375 }
+$91:
+ # 376 h-=th;
+ subu $9, $9, $7
+ # 377
+ # 378 if (--count == 0) break;
+ addu $13, $13, -1
+ beq $13, 0, $92
+ # 379
+ # 380 ret=q<<BN_BITS4;
+ sll $31, $5, 16
+ # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
+ sll $24, $9, 16
+ srl $15, $12, 16
+ or $9, $24, $15
+ # 382 l=(l&BN_MASK2l)<<BN_BITS4;
+ and $12, $12, 65535
+ sll $12, $12, 16
+ # 383 }
+ b $84
+$92:
+ # 384 ret|=q;
+ or $31, $31, $5
+ # 385 return(ret);
+ move $2, $31
+$93:
+ lw $16, 48($sp)
+ lw $31, 56($sp)
+ addu $sp, 64
+ j $31
+ .end bn_div64
+
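The numbered comments threaded through bn_div64 above are the compiler's echo
of the original C source. Re-assembled, with the usual 32-bit SSLeay constants
filled in as an assumption and BN_num_bits_word left as the external helper it
is, the routine reads roughly as:

    #include <stdlib.h>                    /* abort() */

    #define BN_BITS2  32
    #define BN_BITS4  16
    #define BN_MASK2  0xffffffffL
    #define BN_MASK2h 0xffff0000L
    #define BN_MASK2l 0x0000ffffL
    typedef unsigned long BN_ULONG;

    extern int BN_num_bits_word(BN_ULONG); /* lives elsewhere in the library */

    BN_ULONG bn_div64(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    {
        BN_ULONG dh, dl, q, ret = 0, th, tl, t;
        int i, count = 2;

        if (d == 0) return BN_MASK2;

        i = BN_num_bits_word(d);
        if ((i != BN_BITS2) && (h > (BN_ULONG)1 << i))
            abort();                       /* division would overflow */

        i = BN_BITS2 - i;
        if (h >= d) h -= d;
        if (i) {                           /* normalise the divisor */
            d <<= i;
            h = (h << i) | (l >> (BN_BITS2 - i));
            l <<= i;
        }
        dh = (d & BN_MASK2h) >> BN_BITS4;
        dl = (d & BN_MASK2l);
        for (;;) {
            if ((h >> BN_BITS4) == dh)     /* estimate one half-word digit */
                q = BN_MASK2l;
            else
                q = h / dh;
            for (;;) {                     /* correct the estimate downwards */
                t = h - q * dh;
                if ((t & BN_MASK2h) ||
                    ((dl * q) <= ((t << BN_BITS4) + ((l & BN_MASK2h) >> BN_BITS4))))
                    break;
                q--;
            }
            th = q * dh;
            tl = q * dl;
            t = tl >> BN_BITS4;
            tl = (tl << BN_BITS4) & BN_MASK2h;
            th += t;
            if (l < tl) th++;
            l -= tl;
            if (h < th) { h += d; q--; }
            h -= th;
            if (--count == 0) break;
            ret = q << BN_BITS4;           /* first digit goes in the top half */
            h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
            l = (l & BN_MASK2l) << BN_BITS4;
        }
        ret |= q;
        return ret;
    }
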
diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s
new file mode 100644
index 0000000000..e8fdd50d16
--- /dev/null
+++ b/crypto/bn/asm/mips3.s
@@ -0,0 +1,544 @@
+/* This assembler code is for R4000 and above machines.  It takes advantage
+ * of the 64 bit registers present on these CPUs.
+ * Make sure that the SSLeay bignum library is compiled with
+ * SIXTY_FOUR_BIT set and BN_LLONG undefined.
+ * This must either be compiled with the system CC, or, if you use GNU gas,
+ * cc -E mips3.s|gas -o mips3.o
+ */
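
The carry handling throughout this file leans on sltu ("set on less than,
unsigned"): after an add, the result is below an operand exactly when the add
wrapped. A rough C equivalent of bn_add_words as written below, assuming the
64-bit BN_ULONG of the SIXTY_FOUR_BIT configuration:

    typedef unsigned long BN_ULONG;        /* 64-bit limb on these CPUs */

    BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
    {
        BN_ULONG c = 0;                    /* carry in CC, always 0 or 1 */
        while (n-- > 0) {
            BN_ULONG t = *a++ + c;
            c = (t < c);                   /* sltu: wrapped adding the carry? */
            t += *b;
            c += (t < *b);                 /* sltu: wrapped adding *b? */
            b++;
            *r++ = t;
        }
        return c;
    }
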
+ .set reorder
+ .set noat
+
+#define R1 $1
+#define CC $2
+#define R2 $3
+#define R3 $8
+#define R4 $9
+#define L1 $10
+#define L2 $11
+#define L3 $12
+#define L4 $13
+#define H1 $14
+#define H2 $15
+#define H3 $24
+#define H4 $25
+
+#define P1 $4
+#define P2 $5
+#define P3 $6
+#define P4 $7
+
+ .align 2
+ .ent bn_mul_add_words
+ .globl bn_mul_add_words
+.text
+bn_mul_add_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ #blt P3,4,$lab34
+
+ subu R1,P3,4
+ move CC,$0
+ bltz R1,$lab34
+$lab2:
+ ld R1,0(P1)
+ ld L1,0(P2)
+ ld R2,8(P1)
+ ld L2,8(P2)
+ ld R3,16(P1)
+ ld L3,16(P2)
+ ld R4,24(P1)
+ ld L4,24(P2)
+ dmultu L1,P4
+ daddu R1,R1,CC
+ mflo L1
+ sltu CC,R1,CC
+ daddu R1,R1,L1
+ mfhi H1
+ sltu L1,R1,L1
+ sd R1,0(P1)
+ daddu CC,CC,L1
+ dmultu L2,P4
+ daddu CC,H1,CC
+ mflo L2
+ daddu R2,R2,CC
+ sltu CC,R2,CC
+ mfhi H2
+ daddu R2,R2,L2
+ daddu P2,P2,32
+ sltu L2,R2,L2
+ sd R2,8(P1)
+ daddu CC,CC,L2
+ dmultu L3,P4
+ daddu CC,H2,CC
+ mflo L3
+ daddu R3,R3,CC
+ sltu CC,R3,CC
+ mfhi H3
+ daddu R3,R3,L3
+ daddu P1,P1,32
+ sltu L3,R3,L3
+ sd R3,-16(P1)
+ daddu CC,CC,L3
+ dmultu L4,P4
+ daddu CC,H3,CC
+ mflo L4
+ daddu R4,R4,CC
+ sltu CC,R4,CC
+ mfhi H4
+ daddu R4,R4,L4
+ subu P3,P3,4
+ sltu L4,R4,L4
+ daddu CC,CC,L4
+ daddu CC,H4,CC
+
+ subu R1,P3,4
+ sd R4,-8(P1) # delay slot
+ bgez R1,$lab2
+
+ bleu P3,0,$lab3
+ .align 2
+$lab33:
+ ld L1,0(P2)
+ ld R1,0(P1)
+ dmultu L1,P4
+ daddu R1,R1,CC
+ sltu CC,R1,CC
+ daddu P1,P1,8
+ mflo L1
+ mfhi H1
+ daddu R1,R1,L1
+ daddu P2,P2,8
+ sltu L1,R1,L1
+ subu P3,P3,1
+ daddu CC,CC,L1
+ sd R1,-8(P1)
+ daddu CC,H1,CC
+ bgtz P3,$lab33
+ j $31
+ .align 2
+$lab3:
+ j $31
+ .align 2
+$lab34:
+ bgt P3,0,$lab33
+ j $31
+ .end bn_mul_add_words
+
+ .align 2
+ # Program Unit: bn_mul_words
+ .ent bn_mul_words
+ .globl bn_mul_words
+.text
+bn_mul_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P3,P3,4
+ move CC,$0
+ bltz P3,$lab45
+$lab44:
+ ld L1,0(P2)
+ ld L2,8(P2)
+ ld L3,16(P2)
+ ld L4,24(P2)
+ dmultu L1,P4
+ subu P3,P3,4
+ mflo L1
+ mfhi H1
+ daddu L1,L1,CC
+ dmultu L2,P4
+ sltu CC,L1,CC
+ sd L1,0(P1)
+ daddu CC,H1,CC
+ mflo L2
+ mfhi H2
+ daddu L2,L2,CC
+ dmultu L3,P4
+ sltu CC,L2,CC
+ sd L2,8(P1)
+ daddu CC,H2,CC
+ mflo L3
+ mfhi H3
+ daddu L3,L3,CC
+ dmultu L4,P4
+ sltu CC,L3,CC
+ sd L3,16(P1)
+ daddu CC,H3,CC
+ mflo L4
+ mfhi H4
+ daddu L4,L4,CC
+ daddu P1,P1,32
+ sltu CC,L4,CC
+ daddu P2,P2,32
+ daddu CC,H4,CC
+ sd L4,-8(P1)
+
+ bgez P3,$lab44
+ b $lab45
+$lab46:
+ ld L1,0(P2)
+ daddu P1,P1,8
+ dmultu L1,P4
+ daddu P2,P2,8
+ mflo L1
+ mfhi H1
+ daddu L1,L1,CC
+ subu P3,P3,1
+ sltu CC,L1,CC
+ sd L1,-8(P1)
+ daddu CC,H1,CC
+ bgtz P3,$lab46
+ j $31
+$lab45:
+ addu P3,P3,4
+ bgtz P3,$lab46
+ j $31
+ .align 2
+ .end bn_mul_words
+
+ # Program Unit: bn_sqr_words
+ .ent bn_sqr_words
+ .globl bn_sqr_words
+.text
+bn_sqr_words:
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P3,P3,4
+	bltz	P3,$lab55
+$lab54:
+ ld L1,0(P2)
+ ld L2,8(P2)
+ ld L3,16(P2)
+ ld L4,24(P2)
+
+ dmultu L1,L1
+ subu P3,P3,4
+ mflo L1
+ mfhi H1
+ sd L1,0(P1)
+ sd H1,8(P1)
+
+ dmultu L2,L2
+	daddu	P1,P1,64
+ mflo L2
+ mfhi H2
+ sd L2,-48(P1)
+ sd H2,-40(P1)
+
+ dmultu L3,L3
+ daddu P2,P2,32
+ mflo L3
+ mfhi H3
+ sd L3,-32(P1)
+ sd H3,-24(P1)
+
+ dmultu L4,L4
+
+ mflo L4
+ mfhi H4
+ sd L4,-16(P1)
+ sd H4,-8(P1)
+
+ bgtz P3,$lab54
+ b $lab55
+$lab56:
+ ld L1,0(P2)
+ daddu P1,P1,16
+ dmultu L1,L1
+ daddu P2,P2,8
+ subu P3,P3,1
+ mflo L1
+ mfhi H1
+ sd L1,-16(P1)
+ sd H1,-8(P1)
+
+ bgtz P3,$lab56
+ j $31
+$lab55:
+ daddu P3,P3,4
+ bgtz P3,$lab56
+ j $31
+ .align 2
+ .end bn_sqr_words
+
+ # Program Unit: bn_add_words
+ .ent bn_add_words
+ .globl bn_add_words
+.text
+bn_add_words: # 0x590
+ .frame $sp,0,$31
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+
+ subu P4,P4,4
+ move CC,$0
+ bltz P4,$lab65
+$lab64:
+ ld L1,0(P2)
+ ld R1,0(P3)
+ ld L2,8(P2)
+ ld R2,8(P3)
+
+ daddu L1,L1,CC
+ ld L3,16(P2)
+ sltu CC,L1,CC
+ daddu L1,L1,R1
+ sltu R1,L1,R1
+ ld R3,16(P3)
+ daddu CC,CC,R1
+ ld L4,24(P2)
+
+ daddu L2,L2,CC
+ ld R4,24(P3)
+ sltu CC,L2,CC
+ daddu L2,L2,R2
+ sltu R2,L2,R2
+ sd L1,0(P1)
+ daddu CC,CC,R2
+ daddu P1,P1,32
+ daddu L3,L3,CC
+ sd L2,-24(P1)
+
+ sltu CC,L3,CC
+ daddu L3,L3,R3
+ sltu R3,L3,R3
+ daddu P2,P2,32
+ daddu CC,CC,R3
+
+ daddu L4,L4,CC
+ daddu P3,P3,32
+ sltu CC,L4,CC
+ daddu L4,L4,R4
+ sltu R4,L4,R4
+ subu P4,P4,4
+ sd L3,-16(P1)
+ daddu CC,CC,R4
+ sd L4,-8(P1)
+
+ bgtz P4,$lab64
+ b $lab65
+$lab66:
+ ld L1,0(P2)
+ ld R1,0(P3)
+ daddu L1,L1,CC
+ daddu P1,P1,8
+ sltu CC,L1,CC
+ daddu P2,P2,8
+ daddu P3,P3,8
+ daddu L1,L1,R1
+ subu P4,P4,1
+ sltu R1,L1,R1
+ sd L1,-8(P1)
+ daddu CC,CC,R1
+
+ bgtz P4,$lab66
+ j $31
+$lab65:
+ addu P4,P4,4
+ bgtz P4,$lab66
+ j $31
+ .end bn_add_words
+
+#if 1
+ # Program Unit: bn_div64
+ .set at
+ .set reorder
+ .text
+ .align 2
+ .globl bn_div64
+ # 321 {
+ .ent bn_div64
+bn_div64:
+ dsubu $sp, 64
+ sd $31, 56($sp)
+ sd $16, 48($sp)
+ .mask 0x80010000, -56
+ .frame $sp, 64, $31
+ move $9, $4
+ move $12, $5
+ move $16, $6
+ # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t;
+ move $31, $0
+ # 323 int i,count=2;
+ li $13, 2
+ # 324
+ # 325 if (d == 0) return(BN_MASK2);
+ bne $16, 0, $80
+ dli $2, -1
+ b $93
+$80:
+ # 326
+ # 327 i=BN_num_bits_word(d);
+ move $4, $16
+ sd $31, 16($sp)
+ sd $9, 24($sp)
+ sd $12, 32($sp)
+ sd $13, 40($sp)
+ .livereg 0x800ff0e,0xfff
+ jal BN_num_bits_word
+ dli $4, 64
+ ld $31, 16($sp)
+ ld $9, 24($sp)
+ ld $12, 32($sp)
+ ld $13, 40($sp)
+ move $3, $2
+ # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
+ beq $2, $4, $81
+ dli $14, 1
+ dsll $15, $14, $2
+ bleu $9, $15, $81
+ # 329 {
+ # 330 #if !defined(NO_STDIO) && !defined(WIN16)
+ # 331 fprintf(stderr,"Division would overflow (%d)\n",i);
+ # 332 #endif
+ # 333 abort();
+ sd $3, 8($sp)
+ sd $31, 16($sp)
+ sd $9, 24($sp)
+ sd $12, 32($sp)
+ sd $13, 40($sp)
+ .livereg 0xff0e,0xfff
+ jal abort
+ dli $4, 64
+ ld $3, 8($sp)
+ ld $31, 16($sp)
+ ld $9, 24($sp)
+ ld $12, 32($sp)
+ ld $13, 40($sp)
+ # 334 }
+$81:
+ # 335 i=BN_BITS2-i;
+ dsubu $3, $4, $3
+ # 336 if (h >= d) h-=d;
+ bltu $9, $16, $82
+ dsubu $9, $9, $16
+$82:
+ # 337
+ # 338 if (i)
+ beq $3, 0, $83
+ # 339 {
+ # 340 d<<=i;
+ dsll $16, $16, $3
+ # 341 h=(h<<i)|(l>>(BN_BITS2-i));
+ dsll $24, $9, $3
+ dsubu $25, $4, $3
+ dsrl $14, $12, $25
+ or $9, $24, $14
+ # 342 l<<=i;
+ dsll $12, $12, $3
+ # 343 }
+$83:
+ # 344 dh=(d&BN_MASK2h)>>BN_BITS4;
+ # 345 dl=(d&BN_MASK2l);
+ and $8, $16,0xFFFFFFFF00000000
+ dsrl $8, $8, 32
+ # dli $10,0xFFFFFFFF # Is this needed?
+ # and $10, $16, $10
+ dsll $10, $16, 32
+ dsrl $10, $10, 32
+ dli $6,0xFFFFFFFF00000000
+$84:
+ # 346 for (;;)
+ # 347 {
+ # 348 if ((h>>BN_BITS4) == dh)
+ dsrl $15, $9, 32
+ bne $8, $15, $85
+ # 349 q=BN_MASK2l;
+ dli $5, 0xFFFFFFFF
+ b $86
+$85:
+ # 350 else
+ # 351 q=h/dh;
+ ddivu $5, $9, $8
+$86:
+ # 352
+ # 353 for (;;)
+ # 354 {
+ # 355 t=(h-q*dh);
+ dmul $4, $5, $8
+ dsubu $2, $9, $4
+ move $3, $2
+ # 356 if ((t&BN_MASK2h) ||
+ # 357 ((dl*q) <= (
+ # 358 (t<<BN_BITS4)+
+ # 359 ((l&BN_MASK2h)>>BN_BITS4))))
+ and $25, $2, $6
+ bne $25, $0, $87
+ dmul $24, $10, $5
+ dsll $14, $3, 32
+ and $15, $12, $6
+ dsrl $25, $15, 32
+ daddu $15, $14, $25
+ bgtu $24, $15, $88
+$87:
+ # 360 break;
+ dmul $3, $10, $5
+ b $89
+$88:
+ # 361 q--;
+ daddu $5, $5, -1
+ # 362 }
+ b $86
+$89:
+ # 363 th=q*dh;
+ # 364 tl=q*dl;
+ # 365 t=(tl>>BN_BITS4);
+ # 366 tl=(tl<<BN_BITS4)&BN_MASK2h;
+ dsll $14, $3, 32
+ and $2, $14, $6
+ move $11, $2
+ # 367 th+=t;
+ dsrl $25, $3, 32
+ daddu $7, $4, $25
+ # 368
+ # 369 if (l < tl) th++;
+ bgeu $12, $2, $90
+ daddu $7, $7, 1
+$90:
+ # 370 l-=tl;
+ dsubu $12, $12, $11
+ # 371 if (h < th)
+ bgeu $9, $7, $91
+ # 372 {
+ # 373 h+=d;
+ daddu $9, $9, $16
+ # 374 q--;
+ daddu $5, $5, -1
+ # 375 }
+$91:
+ # 376 h-=th;
+ dsubu $9, $9, $7
+ # 377
+ # 378 if (--count == 0) break;
+ addu $13, $13, -1
+ beq $13, 0, $92
+ # 379
+ # 380 ret=q<<BN_BITS4;
+ dsll $31, $5, 32
+ # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
+ dsll $24, $9, 32
+ dsrl $15, $12, 32
+ or $9, $24, $15
+ # 382 l=(l&BN_MASK2l)<<BN_BITS4;
+ and $12, $12, 0xFFFFFFFF
+ dsll $12, $12, 32
+ # 383 }
+ b $84
+$92:
+ # 384 ret|=q;
+ or $31, $31, $5
+ # 385 return(ret);
+ move $2, $31
+$93:
+ ld $16, 48($sp)
+ ld $31, 56($sp)
+ daddu $sp, 64
+ j $31
+ .end bn_div64
+#endif
diff --git a/crypto/bn/asm/x86.pl b/crypto/bn/asm/x86.pl
new file mode 100644
index 0000000000..bf869fd0ee
--- /dev/null
+++ b/crypto/bn/asm/x86.pl
@@ -0,0 +1,28 @@
+#!/usr/local/bin/perl
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+require("x86/mul_add.pl");
+require("x86/mul.pl");
+require("x86/sqr.pl");
+require("x86/div.pl");
+require("x86/add.pl");
+require("x86/sub.pl");
+require("x86/comba.pl");
+
+&asm_init($ARGV[0],"bn-586.pl");
+
+&bn_mul_add_words("bn_mul_add_words");
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_div_words("bn_div_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
+
diff --git a/crypto/bn/asm/x86/add.pl b/crypto/bn/asm/x86/add.pl
new file mode 100644
index 0000000000..0b5cf583e3
--- /dev/null
+++ b/crypto/bn/asm/x86/add.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_add_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &add($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &add($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+	 &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86/comba.pl b/crypto/bn/asm/x86/comba.pl
new file mode 100644
index 0000000000..2291253629
--- /dev/null
+++ b/crypto/bn/asm/x86/comba.pl
@@ -0,0 +1,277 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub mul_add_c
+ {
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the result pointer r[] and store the finished
+	# word (a C sketch of this accumulation step follows the sub)
+
+ &comment("mul a[$ai]*b[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ &mul("edx");
+ &add($c0,"eax");
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
+ ###
+ &adc($c1,"edx");
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
+	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
+ ###
+ &adc($c2,0);
+	# if pos > 1, it means this is the last loop
+	&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;	# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
+ }
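
What mul_add_c emits is one comba accumulation step: the double-width product
a[ai]*b[bi] is added into the three-word accumulator (c0,c1,c2), with the two
adc instructions rippling the carries upward. A C sketch of that step, under
the assumption of 32-bit words (the helper name and types are illustrative):

    /* (c2:c1:c0) += a * b */
    static void mul_add_c(unsigned int a, unsigned int b,
                          unsigned int *c0, unsigned int *c1, unsigned int *c2)
    {
        unsigned long long t = (unsigned long long)a * b;
        unsigned int lo = (unsigned int)t;          /* eax after mul */
        unsigned int hi = (unsigned int)(t >> 32);  /* edx after mul */

        *c0 += lo;                                  /* add c0,eax */
        hi  += (*c0 < lo);                          /* carry; hi <= 0xfffffffe, no overflow */
        *c1 += hi;                                  /* adc c1,edx */
        *c2 += (*c1 < hi);                          /* adc c2,0  */
    }
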
+
+sub sqr_add_c
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the result pointer r[] and store the finished word
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+ ###
+ &adc($c2,0);
+	# if pos > 1, it means this is the last loop
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
+ }
+
+sub sqr_add_c2
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to load the result pointer r[] and store the finished
+	# word (a C sketch of the doubled step follows the sub)
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$a,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add("eax","eax");
+ ###
+ &adc("edx","edx");
+ ###
+ &adc($c2,0);
+ &add($c0,"eax");
+ &adc($c1,"edx");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
+ &adc($c2,0);
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+ ###
+ }
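
sqr_add_c2 handles the off-diagonal terms of a square, each of which occurs
twice (a[i]*a[j] and a[j]*a[i]), so the product is doubled before being
accumulated; the bit shifted out of the high word by the doubling is what the
extra adc into c2 catches. A C sketch under the same 32-bit assumptions:

    /* (c2:c1:c0) += 2 * a * b */
    static void sqr_add_c2(unsigned int a, unsigned int b,
                           unsigned int *c0, unsigned int *c1, unsigned int *c2)
    {
        unsigned long long t = (unsigned long long)a * b;
        unsigned int lo = (unsigned int)t;
        unsigned int hi = (unsigned int)(t >> 32);
        unsigned int k;

        *c2 += hi >> 31;                  /* adc c2,0: bit lost doubling hi */
        hi   = (hi << 1) | (lo >> 31);    /* adc edx,edx */
        lo <<= 1;                         /* add eax,eax */

        *c0 += lo;                        /* add c0,eax */
        k    = (*c0 < lo);                /* carry out of c0 */
        *c1 += k;
        *c2 += (*c1 < k);                 /* adc c1,edx ... */
        *c1 += hi;
        *c2 += (*c1 < hi);                /* ... adc c2,0 */
    }
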
+
+sub bn_mul_comba
+ {
+ local($name,$num)=@_;
+ local($a,$b,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($tot,$end);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $b="edi";
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ &push("esi");
+ &mov($a,&wparam(1));
+ &push("edi");
+ &mov($b,&wparam(2));
+ &push("ebp");
+ &push("ebx");
+
+ &xor($c0,$c0);
+	&mov("eax",&DWP(0,$a,"",0));	# load the first word of a
+	&xor($c1,$c1);
+	&mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("################## Calculate word $i");
+
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($j+1) == $end)
+ {
+ $v=1;
+ $v=2 if (($i+1) == $tot);
+ }
+ else
+ { $v=0; }
+ if (($j+1) != $end)
+ {
+ $na=($ai-1);
+ $nb=($bi+1);
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ # &mov("eax",&wparam(0));
+ # &mov(&DWP($i*4,"eax","",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &comment("save r[$i]");
+ # &mov("eax",&wparam(0));
+ &mov(&DWP($i*4,"eax","",0),$c0);
+
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
+sub bn_sqr_comba
+ {
+ local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($b,$tot,$end,$half);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $r="edi";
+
+ &push("esi");
+ &push("edi");
+ &push("ebp");
+ &push("ebx");
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &xor($c0,$c0);
+ &xor($c1,$c1);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("############### Calculate word $i");
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($ai-1) < ($bi+1))
+ {
+ $v=1;
+ $v=2 if ($i+1) == $tot;
+ }
+ else
+ { $v=0; }
+ if (!$v)
+ {
+ $na=$ai-1;
+ $nb=$bi+1;
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ if ($ai == $bi)
+ {
+ &sqr_add_c($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ else
+ {
+ &sqr_add_c2($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ #&mov(&DWP($i*4,$r,"",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ last;
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &mov(&DWP($i*4,$r,"",0),$c0);
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86/div.pl b/crypto/bn/asm/x86/div.pl
new file mode 100644
index 0000000000..0e90152caa
--- /dev/null
+++ b/crypto/bn/asm/x86/div.pl
@@ -0,0 +1,15 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_div_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+ &mov("edx",&wparam(0)); #
+ &mov("eax",&wparam(1)); #
+ &mov("ebx",&wparam(2)); #
+ &div("ebx");
+ &function_end($name);
+ }
+1;
diff --git a/crypto/bn/asm/x86/f b/crypto/bn/asm/x86/f
new file mode 100644
index 0000000000..22e4112224
--- /dev/null
+++ b/crypto/bn/asm/x86/f
@@ -0,0 +1,3 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
diff --git a/crypto/bn/asm/x86/mul.pl b/crypto/bn/asm/x86/mul.pl
new file mode 100644
index 0000000000..674cb9b055
--- /dev/null
+++ b/crypto/bn/asm/x86/mul.pl
@@ -0,0 +1,77 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ecx";
+ $r="edi";
+ $c="esi";
+ $num="ebp";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+ &mov($w,&wparam(3)); #
+
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+ &jz(&label("mw_finish"));
+
+ &set_label("mw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,32);
+ &sub($num,8);
+ &jz(&label("mw_finish"));
+ &jmp(&label("mw_loop"));
+
+ &set_label("mw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jnz(&label("mw_finish2"));
+ &jmp(&label("mw_end"));
+
+ &set_label("mw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+ # XXX
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &dec($num) if ($i != 7-1);
+ &jz(&label("mw_end")) if ($i != 7-1);
+ }
+ &set_label("mw_end",0);
+ &mov("eax",$c);
+
+ &function_end($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86/mul_add.pl b/crypto/bn/asm/x86/mul_add.pl
new file mode 100644
index 0000000000..61830d3a90
--- /dev/null
+++ b/crypto/bn/asm/x86/mul_add.pl
@@ -0,0 +1,87 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_add_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ebp";
+ $r="edi";
+ $c="esi";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+
+ &mov("ecx",&wparam(2)); #
+ &mov($a,&wparam(1)); #
+
+	&and("ecx",0xfffffff8);	# round num down to a multiple of 8
+ &mov($w,&wparam(3)); #
+
+	&push("ecx");		# make room on the stack for a tmp variable
+
+ &jz(&label("maw_finish"));
+
+ &set_label("maw_loop",0);
+
+ &mov(&swtmp(0),"ecx"); #
+
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mul($w); # *a * w
+		&add("eax",$c);		# L(t)+= c
+		&mov($c,&DWP($i,$r,"",0));	# c= *r
+		&adc("edx",0);		# H(t)+=carry
+		&add("eax",$c);		# L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ }
+
+ &comment("");
+ &mov("ecx",&swtmp(0)); #
+ &add($a,32);
+ &add($r,32);
+ &sub("ecx",8);
+ &jnz(&label("maw_loop"));
+
+ &set_label("maw_finish",0);
+ &mov("ecx",&wparam(2)); # get num
+ &and("ecx",7);
+ &jnz(&label("maw_finish2")); # helps branch prediction
+ &jmp(&label("maw_end"));
+
+ &set_label("maw_finish2",1);
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mul($w); # *a * w
+ &add("eax",$c); # L(t)+=c
+		&mov($c,&DWP($i*4,$r,"",0));	# c= *r
+		&adc("edx",0);			# H(t)+=carry
+		&add("eax",$c);			# L(t)+= *r
+ &adc("edx",0); # H(t)+=carry
+ &dec("ecx") if ($i != 7-1);
+ &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
+ &jz(&label("maw_end")) if ($i != 7-1);
+ }
+ &set_label("maw_end",0);
+ &mov("eax",$c);
+
+	&pop("ecx");		# remove the tmp variable from the stack
+
+ &function_end($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86/sqr.pl b/crypto/bn/asm/x86/sqr.pl
new file mode 100644
index 0000000000..1f90993cf6
--- /dev/null
+++ b/crypto/bn/asm/x86/sqr.pl
@@ -0,0 +1,60 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_sqr_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $r="esi";
+ $a="edi";
+ $num="ebx";
+
+ &mov($r,&wparam(0)); #
+ &mov($a,&wparam(1)); #
+ &mov($num,&wparam(2)); #
+
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+ &jz(&label("sw_finish"));
+
+ &set_label("sw_loop",0);
+ for ($i=0; $i<32; $i+=4)
+ {
+ &comment("Round $i");
+ &mov("eax",&DWP($i,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*2,$r,"",0),"eax"); #
+ &mov(&DWP($i*2+4,$r,"",0),"edx");#
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($r,64);
+ &sub($num,8);
+ &jnz(&label("sw_loop"));
+
+ &set_label("sw_finish",0);
+ &mov($num,&wparam(2)); # get num
+ &and($num,7);
+ &jz(&label("sw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov("eax",&DWP($i*4,$a,"",0)); # *a
+ # XXX
+ &mul("eax"); # *a * *a
+ &mov(&DWP($i*8,$r,"",0),"eax"); #
+ &dec($num) if ($i != 7-1);
+ &mov(&DWP($i*8+4,$r,"",0),"edx");
+ &jz(&label("sw_end")) if ($i != 7-1);
+ }
+ &set_label("sw_end",0);
+
+ &function_end($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86/sub.pl b/crypto/bn/asm/x86/sub.pl
new file mode 100644
index 0000000000..837b0e1b07
--- /dev/null
+++ b/crypto/bn/asm/x86/sub.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_sub_words
+ {
+ local($name)=@_;
+
+ &function_begin($name,"");
+
+ &comment("");
+ $a="esi";
+ $b="edi";
+ $c="eax";
+ $r="ebx";
+ $tmp1="ecx";
+ $tmp2="edx";
+ $num="ebp";
+
+ &mov($r,&wparam(0)); # get r
+ &mov($a,&wparam(1)); # get a
+ &mov($b,&wparam(2)); # get b
+ &mov($num,&wparam(3)); # get num
+ &xor($c,$c); # clear carry
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+
+ &jz(&label("aw_finish"));
+
+ &set_label("aw_loop",0);
+ for ($i=0; $i<8; $i++)
+ {
+ &comment("Round $i");
+
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
+ }
+
+ &comment("");
+ &add($a,32);
+ &add($b,32);
+ &add($r,32);
+ &sub($num,8);
+ &jnz(&label("aw_loop"));
+
+ &set_label("aw_finish",0);
+ &mov($num,&wparam(3)); # get num
+ &and($num,7);
+ &jz(&label("aw_end"));
+
+ for ($i=0; $i<7; $i++)
+ {
+ &comment("Tail Round $i");
+ &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
+ &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+ &sub($tmp1,$c);
+ &mov($c,0);
+ &adc($c,$c);
+ &sub($tmp1,$tmp2);
+ &adc($c,0);
+ &dec($num) if ($i != 6);
+	 &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+ &jz(&label("aw_end")) if ($i != 6);
+ }
+ &set_label("aw_end",0);
+
+# &mov("eax",$c); # $c is "eax"
+
+ &function_end($name);
+ }
+
+1;
diff --git a/crypto/bn/asm/x86w16.asm b/crypto/bn/asm/x86w16.asm
index 74a933a8cd..80a9ed6eef 100644
--- a/crypto/bn/asm/x86w16.asm
+++ b/crypto/bn/asm/x86w16.asm
@@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD PUBLIC 'CODE'
F_TEXT ENDS
_DATA SEGMENT WORD PUBLIC 'DATA'
_DATA ENDS
-CONST SEGMENT WORD PUBLIC 'CONST'
-CONST ENDS
+_CONST SEGMENT WORD PUBLIC 'CONST'
+_CONST ENDS
_BSS SEGMENT WORD PUBLIC 'BSS'
_BSS ENDS
-DGROUP GROUP CONST, _BSS, _DATA
+DGROUP GROUP _CONST, _BSS, _DATA
ASSUME DS: DGROUP, SS: DGROUP
F_TEXT SEGMENT
ASSUME CS: F_TEXT
diff --git a/crypto/bn/asm/x86w32.asm b/crypto/bn/asm/x86w32.asm
index fc6f917714..957d71e3b1 100644
--- a/crypto/bn/asm/x86w32.asm
+++ b/crypto/bn/asm/x86w32.asm
@@ -6,11 +6,11 @@ F_TEXT SEGMENT WORD USE16 PUBLIC 'CODE'
F_TEXT ENDS
_DATA SEGMENT WORD USE16 PUBLIC 'DATA'
_DATA ENDS
-CONST SEGMENT WORD USE16 PUBLIC 'CONST'
-CONST ENDS
+_CONST SEGMENT WORD USE16 PUBLIC 'CONST'
+_CONST ENDS
_BSS SEGMENT WORD USE16 PUBLIC 'BSS'
_BSS ENDS
-DGROUP GROUP CONST, _BSS, _DATA
+DGROUP GROUP _CONST, _BSS, _DATA
ASSUME DS: DGROUP, SS: DGROUP
F_TEXT SEGMENT
ASSUME CS: F_TEXT
@@ -89,7 +89,7 @@ $L555:
mov bp,WORD PTR [bp+26] ; load num
and bp,3
dec bp
- js $L547
+ js $L547m
mov eax,ecx
mul DWORD PTR es:[bx] ; w* *a
@@ -100,7 +100,7 @@ $L555:
mov DWORD PTR ds:[di],eax
mov esi,edx
dec bp
- js $L547 ; Note that we are now testing for -1
+ js $L547m ; Note that we are now testing for -1
;
mov eax,ecx
mul DWORD PTR es:[bx+4] ; w* *a
@@ -111,7 +111,7 @@ $L555:
mov DWORD PTR ds:[di+4],eax
mov esi,edx
dec bp
- js $L547
+ js $L547m
;
mov eax,ecx
mul DWORD PTR es:[bx+8] ; w* *a
@@ -121,7 +121,7 @@ $L555:
adc edx,0
mov DWORD PTR ds:[di+8],eax
mov esi,edx
-$L547:
+$L547m:
mov eax,esi
mov edx,esi
shr edx,16
@@ -315,37 +315,35 @@ _bn_add_words PROC FAR
; ap = 22
; rp = 18
xor esi,esi ;c=0;
+ mov bx,WORD PTR [bp+18] ; load low r
mov si,WORD PTR [bp+22] ; load a
mov es,WORD PTR [bp+24] ; load a
mov di,WORD PTR [bp+26] ; load b
mov ds,WORD PTR [bp+28] ; load b
mov dx,WORD PTR [bp+30] ; load num
- dec dx
- js $L547
xor ecx,ecx
+ dec dx
+ js $L547a
$L5477:
- xor ebx,ebx
mov eax,DWORD PTR es:[si] ; *a
add eax,ecx
- adc ebx,0
+ mov ecx,0
+ adc ecx,0
add si,4 ; a++
add eax,DWORD PTR ds:[di] ; + *b
- mov ecx,ebx
adc ecx,0
- add di,4
- mov bx,WORD PTR [bp+18]
mov ds,WORD PTR [bp+20]
+ add di,4
mov DWORD PTR ds:[bx],eax
- add bx,4
mov ds,WORD PTR [bp+28]
- mov WORD PTR [bp+18],bx
+ add bx,4
dec dx
- js $L547 ; Note that we are now testing for -1
+ js $L547a ; Note that we are now testing for -1
jmp $L5477
;
-$L547:
+$L547a:
mov eax,ecx
mov edx,ecx
shr edx,16