author     Nikos Mavrogiannopoulos <nmav@redhat.com>    2014-05-30 15:40:14 +0200
committer  Nikos Mavrogiannopoulos <nmav@redhat.com>    2014-05-30 15:40:14 +0200
commit     676040da1b760daf8da94944ebf271696b955b59 (patch)
tree       ba26fdcab72161ff01cb1aa65dbaa4e7df6c1940
parent     bd93a14d329f0bdd291f1461d11db0c1dc33f369 (diff)
download   gnutls-676040da1b760daf8da94944ebf271696b955b59.tar.gz
Updated asm sources
m---------  devel/openssl | 0
-rw-r--r--  lib/accelerated/x86/coff/aes-ssse3-x86.s | 112
-rw-r--r--  lib/accelerated/x86/coff/aes-ssse3-x86_64.s | 127
-rw-r--r--  lib/accelerated/x86/coff/aesni-x86.s | 881
-rw-r--r--  lib/accelerated/x86/coff/aesni-x86_64.s | 2401
-rw-r--r--  lib/accelerated/x86/coff/e_padlock-x86.s | 356
-rw-r--r--  lib/accelerated/x86/coff/e_padlock-x86_64.s | 359
-rw-r--r--  lib/accelerated/x86/coff/ghash-x86_64.s | 649
-rw-r--r--  lib/accelerated/x86/coff/sha1-ssse3-x86_64.s | 2599
-rw-r--r--  lib/accelerated/x86/coff/sha256-ssse3-x86.s | 3315
-rw-r--r--  lib/accelerated/x86/coff/sha512-ssse3-x86.s | 2
-rw-r--r--  lib/accelerated/x86/coff/sha512-ssse3-x86_64.s | 2658
-rw-r--r--  lib/accelerated/x86/elf/aes-ssse3-x86.s | 112
-rw-r--r--  lib/accelerated/x86/elf/aes-ssse3-x86_64.s | 113
-rw-r--r--  lib/accelerated/x86/elf/aesni-x86.s | 883
-rw-r--r--  lib/accelerated/x86/elf/aesni-x86_64.s | 2166
-rw-r--r--  lib/accelerated/x86/elf/e_padlock-x86.s | 362
-rw-r--r--  lib/accelerated/x86/elf/e_padlock-x86_64.s | 359
-rw-r--r--  lib/accelerated/x86/elf/ghash-x86_64.s | 572
-rw-r--r--  lib/accelerated/x86/elf/sha1-ssse3-x86_64.s | 2555
-rw-r--r--  lib/accelerated/x86/elf/sha256-ssse3-x86.s | 3317
-rw-r--r--  lib/accelerated/x86/elf/sha512-ssse3-x86.s | 2
-rw-r--r--  lib/accelerated/x86/elf/sha512-ssse3-x86_64.s | 2608
-rw-r--r--  lib/accelerated/x86/macosx/aes-ssse3-x86.s | 112
-rw-r--r--  lib/accelerated/x86/macosx/aes-ssse3-x86_64.s | 113
-rw-r--r--  lib/accelerated/x86/macosx/aesni-x86.s | 877
-rw-r--r--  lib/accelerated/x86/macosx/aesni-x86_64.s | 2166
-rw-r--r--  lib/accelerated/x86/macosx/e_padlock-x86.s | 362
-rw-r--r--  lib/accelerated/x86/macosx/e_padlock-x86_64.s | 359
-rw-r--r--  lib/accelerated/x86/macosx/ghash-x86_64.s | 582
-rw-r--r--  lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s | 2555
-rw-r--r--  lib/accelerated/x86/macosx/sha256-ssse3-x86.s | 3315
-rw-r--r--  lib/accelerated/x86/macosx/sha512-ssse3-x86.s | 2
-rw-r--r--  lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s | 2608
34 files changed, 27755 insertions, 11804 deletions
diff --git a/devel/openssl b/devel/openssl
-Subproject b0ee17ad475c97f068c7314efafdd8a53d92af5
+Subproject e09ea622bba106e13ab85173c205f354b0f1d48
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s
index e209b7d0c0..063f6b157e 100644
--- a/lib/accelerated/x86/coff/aes-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s
@@ -84,33 +84,33 @@ __vpaes_encrypt_core:
movdqa %xmm6,%xmm1
movdqa (%ebp),%xmm2
pandn %xmm0,%xmm1
- movdqu (%edx),%xmm5
- psrld $4,%xmm1
pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
.byte 102,15,56,0,208
movdqa 16(%ebp),%xmm0
-.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
+ psrld $4,%xmm1
addl $16,%edx
+.byte 102,15,56,0,193
leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
jmp .L000enc_entry
.align 16
.L001enc_loop:
movdqa 32(%ebp),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa 64(%ebp),%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
movdqa 80(%ebp),%xmm2
-.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addl $16,%edx
pxor %xmm2,%xmm0
@@ -119,28 +119,28 @@ __vpaes_encrypt_core:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andl $48,%ecx
- pxor %xmm3,%xmm0
subl $1,%eax
+ pxor %xmm3,%xmm0
.L000enc_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
- movdqu (%edx),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
pxor %xmm1,%xmm3
jnz .L001enc_loop
movdqa 96(%ebp),%xmm4
@@ -155,8 +155,8 @@ __vpaes_encrypt_core:
.def __vpaes_decrypt_core; .scl 3; .type 32; .endef
.align 16
__vpaes_decrypt_core:
- movl 240(%edx),%eax
leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
movdqa %xmm6,%xmm1
movdqa -64(%ebx),%xmm2
pandn %xmm0,%xmm1
@@ -179,56 +179,56 @@ __vpaes_decrypt_core:
.align 16
.L003dec_loop:
movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addl $16,%edx
-.byte 102,15,56,0,197
movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subl $1,%eax
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ addl $16,%edx
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
.L002dec_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
pandn %xmm0,%xmm1
- psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm2
+ psrld $4,%xmm1
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm7,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
jnz .L003dec_loop
movdqa 96(%ebx),%xmm4
.byte 102,15,56,0,226
@@ -335,12 +335,12 @@ __vpaes_schedule_core:
.def __vpaes_schedule_192_smear; .scl 3; .type 32; .endef
.align 16
__vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
ret
.def __vpaes_schedule_round; .scl 3; .type 32; .endef
@@ -601,6 +601,8 @@ _vpaes_cbc_encrypt:
movl 24(%esp),%edi
movl 28(%esp),%eax
movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
leal -56(%esp),%ebx
movl 36(%esp),%ebp
andl $-16,%ebx
@@ -610,18 +612,17 @@ _vpaes_cbc_encrypt:
subl %esi,%edi
movl %ebx,48(%esp)
movl %edi,(%esp)
- subl $16,%eax
movl %edx,4(%esp)
movl %ebp,8(%esp)
movl %eax,%edi
- leal .L_vpaes_consts+0x30-.L020pic_point,%ebp
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
call __vpaes_preheat
-.L020pic_point:
+.L021pic_point:
cmpl $0,%ecx
- je .L021cbc_dec_loop
- jmp .L022cbc_enc_loop
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
.align 16
-.L022cbc_enc_loop:
+.L023cbc_enc_loop:
movdqu (%esi),%xmm0
pxor %xmm1,%xmm0
call __vpaes_encrypt_core
@@ -631,10 +632,10 @@ _vpaes_cbc_encrypt:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc .L022cbc_enc_loop
- jmp .L023cbc_done
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
.align 16
-.L021cbc_dec_loop:
+.L022cbc_dec_loop:
movdqu (%esi),%xmm0
movdqa %xmm1,16(%esp)
movdqa %xmm0,32(%esp)
@@ -646,11 +647,12 @@ _vpaes_cbc_encrypt:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc .L021cbc_dec_loop
-.L023cbc_done:
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
movl 8(%esp),%ebx
movl 48(%esp),%esp
movdqu %xmm1,(%ebx)
+.L020cbc_abort:
popl %edi
popl %esi
popl %ebx
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
index 1a6cf83232..c052b5ba6c 100644
--- a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
@@ -43,8 +43,8 @@ _vpaes_encrypt_core:
movdqa .Lk_ipt+16(%rip),%xmm0
.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
addq $16,%r9
+ pxor %xmm2,%xmm0
leaq .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
@@ -52,19 +52,19 @@ _vpaes_encrypt_core:
.Lenc_loop:
movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa %xmm15,%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
movdqa %xmm14,%xmm2
.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
- movdqa (%r11,%r10,1),%xmm4
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addq $16,%r9
pxor %xmm2,%xmm0
@@ -73,30 +73,30 @@ _vpaes_encrypt_core:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andq $48,%r11
- pxor %xmm3,%xmm0
subq $1,%rax
+ pxor %xmm3,%xmm0
.Lenc_entry:
movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
- movdqu (%r9),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
jnz .Lenc_loop
@@ -149,62 +149,61 @@ _vpaes_decrypt_core:
movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addq $16,%r9
-
-.byte 102,15,56,0,197
movdqa 0(%r10),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%r10),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subq $1,%rax
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
-.byte 102,15,56,0,197
- movdqa 32(%r10),%xmm4
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+.byte 102,15,56,0,226
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
-
+ addq $16,%r9
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
.Ldec_entry:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm2
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
jnz .Ldec_loop
@@ -212,7 +211,7 @@ _vpaes_decrypt_core:
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
- movdqa .Lk_sr-.Lk_dsbd(%r11),%xmm2
+ movdqa -352(%r11),%xmm2
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,194
@@ -232,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -278,7 +277,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
@@ -299,7 +298,7 @@ _vpaes_schedule_core:
.p2align 4
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -308,13 +307,13 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
@@ -331,18 +330,18 @@ _vpaes_schedule_core:
.p2align 4
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
pshufd $255,%xmm0,%xmm0
@@ -380,7 +379,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -412,12 +411,12 @@ _vpaes_schedule_core:
.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
.byte 0xf3,0xc3
@@ -827,6 +826,8 @@ vpaes_cbc_encrypt:
movq 48(%rsp),%r9
xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
leaq -184(%rsp),%rsp
movaps %xmm6,16(%rsp)
movaps %xmm7,32(%rsp)
@@ -841,7 +842,6 @@ vpaes_cbc_encrypt:
.Lcbc_body:
movdqu (%r8),%xmm6
subq %rdi,%rsi
- subq $16,%rcx
call _vpaes_preheat
cmpl $0,%r9d
je .Lcbc_dec_loop
@@ -882,6 +882,7 @@ vpaes_cbc_encrypt:
movaps 160(%rsp),%xmm15
leaq 184(%rsp),%rsp
.Lcbc_epilogue:
+.Lcbc_abort:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
@@ -1006,7 +1007,7 @@ _vpaes_consts:
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
@@ -1045,7 +1046,7 @@ se_handler:
leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 184(%rax),%rax
.Lin_prologue:
@@ -1058,7 +1059,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1115,21 +1116,21 @@ se_handler:
.LSEH_info_vpaes_set_encrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_key_body,.Lenc_key_epilogue
+.rva .Lenc_key_body,.Lenc_key_epilogue
.LSEH_info_vpaes_set_decrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_key_body,.Ldec_key_epilogue
+.rva .Ldec_key_body,.Ldec_key_epilogue
.LSEH_info_vpaes_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_body,.Lenc_epilogue
+.rva .Lenc_body,.Lenc_epilogue
.LSEH_info_vpaes_decrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_body,.Ldec_epilogue
+.rva .Ldec_body,.Ldec_epilogue
.LSEH_info_vpaes_cbc_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lcbc_body,.Lcbc_epilogue
+.rva .Lcbc_body,.Lcbc_epilogue
diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s
index 502be77883..1c9b935f34 100644
--- a/lib/accelerated/x86/coff/aesni-x86.s
+++ b/lib/accelerated/x86/coff/aesni-x86.s
@@ -85,29 +85,82 @@ _aesni_decrypt:
.byte 102,15,56,223,209
movups %xmm2,(%eax)
ret
+.def __aesni_encrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.def __aesni_decrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
.def __aesni_encrypt3; .scl 3; .type 32; .endef
.align 16
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -119,25 +172,26 @@ __aesni_encrypt3:
.align 16
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -150,27 +204,29 @@ __aesni_decrypt3:
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -185,27 +241,29 @@ __aesni_encrypt4:
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -219,45 +277,44 @@ __aesni_decrypt4:
.align 16
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp .L_aesni_encrypt6_enter
.align 16
-.L006enc6_loop:
+.L008enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L008enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -275,45 +332,44 @@ __aesni_encrypt6:
.align 16
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp .L_aesni_decrypt6_enter
.align 16
-.L007dec6_loop:
+.L009dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -342,14 +398,14 @@ _aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L010ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L011ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L012ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -358,9 +414,9 @@ _aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L013ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L014ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -375,12 +431,12 @@ _aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L013ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L014ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -389,18 +445,18 @@ _aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L010ecb_ret
+.L012ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L015ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L016ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L017ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L018ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -409,50 +465,49 @@ _aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L013ecb_enc_one:
+.L015ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L019enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L019enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+.L016ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L015ecb_enc_three:
+.L017ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L016ecb_enc_four:
+.L018ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L009ecb_decrypt:
+.L011ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L020ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -461,9 +516,9 @@ _aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L021ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L022ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -478,12 +533,12 @@ _aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L021ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L022ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -492,18 +547,18 @@ _aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L010ecb_ret
+.L020ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L023ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L024ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L025ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L026ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -512,44 +567,43 @@ _aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L021ecb_dec_one:
+.L023ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L027dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L027dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L024ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L023ecb_dec_three:
+.L025ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L024ecb_dec_four:
+.L026ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L010ecb_ret:
popl %edi
popl %esi
popl %ebx
@@ -587,45 +641,45 @@ _aesni_ccm64_encrypt_blocks:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L028ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L029ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L029ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L028ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
@@ -675,67 +729,70 @@ _aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L030enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L030enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L031ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L031ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L032ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L033ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L033ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L031ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L032ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L034enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L034enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
@@ -764,7 +821,7 @@ _aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L035ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -780,63 +837,59 @@ _aesni_ctr32_encrypt_blocks:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L036ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L037ctr32_loop6
+.align 16
+.L037ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -847,51 +900,51 @@ _aesni_ctr32_encrypt_blocks:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L037ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L038ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L036ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L039ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L040ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L041ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L042ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -909,39 +962,39 @@ _aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L035ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L039ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L043enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L043enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L038ctr32_two:
- call __aesni_encrypt3
+.L040ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L039ctr32_three:
+.L041ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -952,9 +1005,9 @@ _aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L040ctr32_four:
+.L042ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -968,7 +1021,7 @@ _aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L038ctr32_ret:
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -992,12 +1045,12 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L044enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L044enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1021,12 +1074,14 @@ _aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L045xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L046xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L046xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1062,6 +1117,7 @@ _aesni_xts_encrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1077,19 +1133,17 @@ _aesni_xts_encrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1114,26 +1168,25 @@ _aesni_xts_encrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L046xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L045xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L047xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L048xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L049xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1142,7 +1195,7 @@ _aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L050xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1152,7 +1205,7 @@ _aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L051xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1184,9 +1237,9 @@ _aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L046xts_enc_one:
+.L048xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1194,37 +1247,36 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L053enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L053enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L047xts_enc_two:
+.L049xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L048xts_enc_three:
+.L050xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1242,9 +1294,9 @@ _aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L049xts_enc_four:
+.L051xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1266,28 +1318,28 @@ _aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L047xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L054xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L055xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L052xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L054xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L055xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1295,7 +1347,7 @@ _aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L055xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1305,16 +1357,16 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L056enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L056enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L054xts_enc_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1338,12 +1390,12 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L057enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L057enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1372,12 +1424,14 @@ _aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L058xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L059xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L059xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1413,6 +1467,7 @@ _aesni_xts_decrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1428,19 +1483,17 @@ _aesni_xts_decrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1465,26 +1518,25 @@ _aesni_xts_decrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L059xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L058xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L060xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L061xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L062xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1493,7 +1545,7 @@ _aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L063xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1503,7 +1555,7 @@ _aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L064xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1535,9 +1587,9 @@ _aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L059xts_dec_one:
+.L061xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1545,36 +1597,36 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L066dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L066dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L060xts_dec_two:
+.L062xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L061xts_dec_three:
+.L063xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1592,9 +1644,9 @@ _aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L062xts_dec_four:
+.L064xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1616,20 +1668,20 @@ _aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L060xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L067xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L068xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L065xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L067xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1639,7 +1691,7 @@ _aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L068xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1653,16 +1705,16 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L069dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L069dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L070xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1670,7 +1722,7 @@ _aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L070xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1680,16 +1732,16 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L071dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L071dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L067xts_dec_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1714,7 +1766,7 @@ _aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L072cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1722,14 +1774,14 @@ _aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L073cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L074cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L075cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L075cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1737,24 +1789,24 @@ _aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L076enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L076enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L075cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L074cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ jmp .L077cbc_ret
+.L074cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1765,20 +1817,20 @@ _aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L075cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L073cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L078cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L079cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L080cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L079cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1808,28 +1860,28 @@ _aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L080cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L081cbc_dec_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L078cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L082cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L083cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L084cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L085cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1852,28 +1904,27 @@ _aesni_cbc_encrypt:
leal 64(%edi),%edi
movaps %xmm6,%xmm2
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L082cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L086dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L086dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L083cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
@@ -1881,9 +1932,9 @@ _aesni_cbc_encrypt:
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L084cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
@@ -1894,9 +1945,9 @@ _aesni_cbc_encrypt:
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L085cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1911,23 +1962,23 @@ _aesni_cbc_encrypt:
leal 48(%edi),%edi
movaps %xmm5,%xmm2
subl $64,%eax
-.L079cbc_dec_tail_collected:
+.L081cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L087cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ jmp .L077cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L087cbc_dec_tail_partial:
movaps %xmm2,(%esp)
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+.L077cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
movups %xmm7,(%ebp)
-.L070cbc_abort:
+.L072cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1937,51 +1988,51 @@ _aesni_cbc_encrypt:
.align 16
__aesni_set_encrypt_key:
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L088bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L088bad_pointer
movups (%eax),%xmm0
xorps %xmm4,%xmm4
leal 16(%edx),%edx
cmpl $256,%ecx
- je .L08714rounds
+ je .L08914rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09012rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L091bad_keybits
.align 16
-.L09010rounds:
+.L09210rounds:
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L093key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L094key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
xorl %eax,%eax
ret
.align 16
-.L092key_128:
+.L094key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L093key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1990,38 +2041,38 @@ __aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09012rounds:
movq 16(%eax),%xmm2
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L095key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L096key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
xorl %eax,%eax
ret
.align 16
-.L095key_192a:
+.L097key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L095key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L098key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2035,56 +2086,56 @@ __aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L096key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L098key_192b_warm
.align 16
-.L08714rounds:
+.L08914rounds:
movups 16(%eax),%xmm2
movl $13,%ecx
leal 16(%edx),%edx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L099key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L101key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
ret
.align 16
-.L099key_256a:
+.L101key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L099key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2093,7 +2144,7 @@ __aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L100key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2104,11 +2155,11 @@ __aesni_set_encrypt_key:
xorps %xmm1,%xmm2
ret
.align 4
-.L086bad_pointer:
+.L088bad_pointer:
movl $-1,%eax
ret
.align 4
-.L089bad_keybits:
+.L091bad_keybits:
movl $-2,%eax
ret
.globl _aesni_set_encrypt_key
@@ -2133,7 +2184,7 @@ _aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L102dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2141,7 +2192,7 @@ _aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L103dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2151,12 +2202,12 @@ _aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L103dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
xorl %eax,%eax
-.L100dec_key_ret:
+.L102dec_key_ret:
ret
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s
index 8751ccba8f..a79c18b158 100644
--- a/lib/accelerated/x86/coff/aesni-x86_64.s
+++ b/lib/accelerated/x86/coff/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.def aesni_encrypt; .scl 2; .type 32; .endef
.p2align 4
@@ -53,7 +54,7 @@ aesni_encrypt:
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
movups %xmm2,(%rdx)
.byte 0xf3,0xc3
@@ -74,34 +75,93 @@ aesni_decrypt:
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
movups %xmm2,(%rdx)
.byte 0xf3,0xc3
+.def _aesni_encrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+.def _aesni_decrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+
.def _aesni_encrypt3; .scl 3; .type 32; .endef
.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
@@ -116,25 +176,26 @@ _aesni_encrypt3:
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
@@ -149,28 +210,30 @@ _aesni_decrypt3:
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
@@ -187,28 +250,30 @@ _aesni_encrypt4:
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
@@ -225,43 +290,43 @@ _aesni_decrypt4:
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%rcx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.p2align 4
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
@@ -282,43 +347,43 @@ _aesni_encrypt6:
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%rcx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.p2align 4
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,51 @@ _aesni_decrypt6:
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
+.byte 102,15,56,220,217
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop8_enter
.p2align 4
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
@@ -409,52 +473,51 @@ _aesni_encrypt8:
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
+.byte 102,15,56,222,217
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop8_enter
.p2align 4
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
@@ -593,14 +656,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.p2align 4
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -738,14 +800,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.p2align 4
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -811,53 +872,53 @@ aesni_ccm64_encrypt_blocks:
movaps %xmm9,48(%rsp)
.Lccm64_enc_body:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.p2align 4
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
movups %xmm3,(%r9)
@@ -893,15 +954,15 @@ aesni_ccm64_decrypt_blocks:
movaps %xmm9,48(%rsp)
.Lccm64_dec_body:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -911,17 +972,21 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -930,36 +995,36 @@ aesni_ccm64_decrypt_blocks:
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.p2align 4
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -970,7 +1035,7 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
movups %xmm3,(%r9)
movaps (%rsp),%xmm6
@@ -997,211 +1062,529 @@ aesni_ctr32_encrypt_blocks:
movq %r9,%rcx
movq 40(%rsp),%r8
- leaq -200(%rsp),%rsp
- movaps %xmm6,32(%rsp)
- movaps %xmm7,48(%rsp)
- movaps %xmm8,64(%rsp)
- movaps %xmm9,80(%rsp)
- movaps %xmm10,96(%rsp)
- movaps %xmm11,112(%rsp)
- movaps %xmm12,128(%rsp)
- movaps %xmm13,144(%rsp)
- movaps %xmm14,160(%rsp)
- movaps %xmm15,176(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $288,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lctr32_body:
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je .Lctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa .Lbswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
-
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
movl 240(%rcx),%eax
+ xorl %r11d,%r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,0(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,16(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
+ xorl %r11d,%r9d
+ andl $71303168,%r10d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
jb .Lctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
+
subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
+ leaq 128(%rcx),%rcx
+ subq $2,%rdx
+ jmp .Lctr32_loop8
+
+.p2align 4
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
jmp .Lctr32_loop6
.p2align 4
.Lctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+ call .Lenc_loop6
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc .Lctr32_loop6
- pxor %xmm0,%xmm3
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
+.p2align 5
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa .Lincrement32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa 0(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp .Lctr32_enc_loop6_enter
-.p2align 4
-.Lctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz .Lctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb .Lctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd 16(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,0(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,16(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc .Lctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
- addq $6,%rdx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+
+.p2align 4
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
jz .Lctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
.Lctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb .Lctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je .Lctr32_two
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb .Lctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je .Lctr32_four
+ call .Lenc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.p2align 5
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.p2align 5
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp .Lctr32_done
.p2align 4
.Lctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-.Lctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1211,58 +1594,27 @@ aesni_ctr32_encrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
+ jnz .Loop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
-.Lctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
-.Lctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp .Lctr32_done
.p2align 4
-.Lctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
.Lctr32_done:
- movaps 32(%rsp),%xmm6
- movaps 48(%rsp),%xmm7
- movaps 64(%rsp),%xmm8
- movaps 80(%rsp),%xmm9
- movaps 96(%rsp),%xmm10
- movaps 112(%rsp),%xmm11
- movaps 128(%rsp),%xmm12
- movaps 144(%rsp),%xmm13
- movaps 160(%rsp),%xmm14
- movaps 176(%rsp),%xmm15
- leaq 200(%rsp),%rsp
-.Lctr32_ret:
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
+.Lctr32_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
@@ -1282,254 +1634,297 @@ aesni_xts_encrypt:
movq 40(%rsp),%r8
movq 48(%rsp),%r9
- leaq -264(%rsp),%rsp
- movaps %xmm6,96(%rsp)
- movaps %xmm7,112(%rsp)
- movaps %xmm8,128(%rsp)
- movaps %xmm9,144(%rsp)
- movaps %xmm10,160(%rsp)
- movaps %xmm11,176(%rsp)
- movaps %xmm12,192(%rsp)
- movaps %xmm13,208(%rsp)
- movaps %xmm14,224(%rsp)
- movaps %xmm15,240(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $272,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lxts_enc_body:
- movups (%r9),%xmm15
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.p2align 4
+.p2align 5
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.p2align 5
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1570,7 +1965,7 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1586,7 +1981,7 @@ aesni_xts_encrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1632,15 +2027,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1675,23 +2070,24 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
- movaps 96(%rsp),%xmm6
- movaps 112(%rsp),%xmm7
- movaps 128(%rsp),%xmm8
- movaps 144(%rsp),%xmm9
- movaps 160(%rsp),%xmm10
- movaps 176(%rsp),%xmm11
- movaps 192(%rsp),%xmm12
- movaps 208(%rsp),%xmm13
- movaps 224(%rsp),%xmm14
- movaps 240(%rsp),%xmm15
- leaq 264(%rsp),%rsp
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_enc_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -1712,260 +2108,303 @@ aesni_xts_decrypt:
movq 40(%rsp),%r8
movq 48(%rsp),%r9
- leaq -264(%rsp),%rsp
- movaps %xmm6,96(%rsp)
- movaps %xmm7,112(%rsp)
- movaps %xmm8,128(%rsp)
- movaps %xmm9,144(%rsp)
- movaps %xmm10,160(%rsp)
- movaps %xmm11,176(%rsp)
- movaps %xmm12,192(%rsp)
- movaps %xmm13,208(%rsp)
- movaps %xmm14,224(%rsp)
- movaps %xmm15,240(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $272,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lxts_dec_body:
- movups (%r9),%xmm15
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.p2align 4
+.p2align 5
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.p2align 5
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -2015,7 +2454,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -2032,7 +2471,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -2058,7 +2497,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -2068,14 +2507,8 @@ aesni_xts_decrypt:
.p2align 4
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -2086,16 +2519,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -2119,7 +2552,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -2149,23 +2582,24 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
- movaps 96(%rsp),%xmm6
- movaps 112(%rsp),%xmm7
- movaps 128(%rsp),%xmm8
- movaps 144(%rsp),%xmm9
- movaps 160(%rsp),%xmm10
- movaps 176(%rsp),%xmm11
- movaps 192(%rsp),%xmm12
- movaps 208(%rsp),%xmm13
- movaps 224(%rsp),%xmm14
- movaps 240(%rsp),%xmm15
- leaq 264(%rsp),%rsp
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_dec_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -2215,7 +2649,7 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2231,169 +2665,408 @@ aesni_cbc_encrypt:
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.p2align 4
.Lcbc_decrypt:
- leaq -88(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,16(%rsp)
- movaps %xmm8,32(%rsp)
- movaps %xmm9,48(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $176,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
.Lcbc_decrypt_body:
- movups (%r8),%xmm9
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe .Lcbc_dec_tail
- shrl $1,%r10d
- subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,64(%rsp)
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
+ andl $71303168,%r9d
+ subq $80,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $32,%rdx
+ leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.p2align 4
.Lcbc_dec_loop8:
- movaps %xmm0,64(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
.byte 102,68,15,56,222,193
+ setnc %r11b
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call .Ldec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.p2align 4
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps 64(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.p2align 4
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.p2align 4
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $96,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $80,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe .Lcbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe .Lcbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,64(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps 64(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp .Lcbc_dec_tail_collected
+
.p2align 4
.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2403,118 +3076,80 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp .Lcbc_dec_tail_collected
-.p2align 4
-.Lcbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp .Lcbc_dec_tail_collected
-.p2align 4
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp .Lcbc_dec_tail_collected
+
.p2align 4
.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp .Lcbc_dec_ret
.p2align 4
.Lcbc_dec_tail_partial:
- movaps %xmm2,64(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq 64(%rsp),%rsi
-.long 0x9066A4F3
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
.Lcbc_dec_ret:
- movaps (%rsp),%xmm6
- movaps 16(%rsp),%xmm7
- movaps 32(%rsp),%xmm8
- movaps 48(%rsp),%xmm9
- leaq 88(%rsp),%rsp
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lcbc_ret:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -2524,7 +3159,7 @@ aesni_cbc_encrypt:
.def aesni_set_decrypt_key; .scl 2; .type 32; .endef
.p2align 4
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%edx
testl %eax,%eax
@@ -2563,7 +3198,7 @@ aesni_set_decrypt_key:
.p2align 4
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rcx,%rcx
jz .Lenc_key_ret
@@ -2759,6 +3394,8 @@ __aesni_set_encrypt_key:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
@@ -2817,51 +3454,15 @@ ccm64_se_handler:
leaq 0(%rax),%rsi
leaq 512(%r8),%rdi
movl $8,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 88(%rax),%rax
jmp .Lcommon_seh_tail
-.def ctr32_se_handler; .scl 3; .type 32; .endef
+.def ctr_xts_se_handler; .scl 3; .type 32; .endef
.p2align 4
-ctr32_se_handler:
- pushq %rsi
- pushq %rdi
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushfq
- subq $64,%rsp
-
- movq 120(%r8),%rax
- movq 248(%r8),%rbx
-
- leaq .Lctr32_body(%rip),%r10
- cmpq %r10,%rbx
- jb .Lcommon_seh_tail
-
- movq 152(%r8),%rax
-
- leaq .Lctr32_ret(%rip),%r10
- cmpq %r10,%rbx
- jae .Lcommon_seh_tail
-
- leaq 32(%rax),%rsi
- leaq 512(%r8),%rdi
- movl $20,%ecx
-.long 0xa548f3fc
- leaq 200(%rax),%rax
-
- jmp .Lcommon_seh_tail
-
-
-.def xts_se_handler; .scl 3; .type 32; .endef
-.p2align 4
-xts_se_handler:
+ctr_xts_se_handler:
pushq %rsi
pushq %rdi
pushq %rbx
@@ -2891,13 +3492,13 @@ xts_se_handler:
cmpq %r10,%rbx
jae .Lcommon_seh_tail
- leaq 96(%rax),%rsi
+ movq 160(%r8),%rax
+ leaq -160(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
- leaq 104+160(%rax),%rax
+.long 0xa548f3fc
- jmp .Lcommon_seh_tail
+ jmp .Lcommon_rbp_tail
.def cbc_se_handler; .scl 3; .type 32; .endef
.p2align 4
@@ -2928,11 +3529,16 @@ cbc_se_handler:
cmpq %r10,%rbx
jae .Lcommon_seh_tail
- leaq 0(%rax),%rsi
+ leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
- movl $8,%ecx
-.long 0xa548f3fc
- leaq 88(%rax),%rax
+ movl $20,%ecx
+.long 0xa548f3fc
+
+.Lcommon_rbp_tail:
+ movq 160(%r8),%rax
+ movq (%rax),%rbp
+ leaq 8(%rax),%rax
+ movq %rbp,160(%r8)
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
@@ -2948,7 +3554,7 @@ cbc_se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -3022,26 +3628,27 @@ cbc_se_handler:
.LSEH_info_ccm64_enc:
.byte 9,0,0,0
.rva ccm64_se_handler
-.rva .Lccm64_enc_body,.Lccm64_enc_ret
+.rva .Lccm64_enc_body,.Lccm64_enc_ret
.LSEH_info_ccm64_dec:
.byte 9,0,0,0
.rva ccm64_se_handler
-.rva .Lccm64_dec_body,.Lccm64_dec_ret
+.rva .Lccm64_dec_body,.Lccm64_dec_ret
.LSEH_info_ctr32:
.byte 9,0,0,0
-.rva ctr32_se_handler
+.rva ctr_xts_se_handler
+.rva .Lctr32_body,.Lctr32_epilogue
.LSEH_info_xts_enc:
.byte 9,0,0,0
-.rva xts_se_handler
-.rva .Lxts_enc_body,.Lxts_enc_epilogue
+.rva ctr_xts_se_handler
+.rva .Lxts_enc_body,.Lxts_enc_epilogue
.LSEH_info_xts_dec:
.byte 9,0,0,0
-.rva xts_se_handler
-.rva .Lxts_dec_body,.Lxts_dec_epilogue
+.rva ctr_xts_se_handler
+.rva .Lxts_dec_body,.Lxts_dec_epilogue
.LSEH_info_cbc:
.byte 9,0,0,0
.rva cbc_se_handler
.LSEH_info_key:
.byte 0x01,0x04,0x01,0x00
-.byte 0x04,0x02,0x00,0x00
+.byte 0x04,0x02,0x00,0x00
diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/coff/e_padlock-x86.s
index 47dd6d36d4..41f87b1174 100644
--- a/lib/accelerated/x86/coff/e_padlock-x86.s
+++ b/lib/accelerated/x86/coff/e_padlock-x86.s
@@ -180,16 +180,14 @@ _padlock_ecb_encrypt:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -201,10 +199,28 @@ _padlock_ecb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -229,8 +245,8 @@ _padlock_ecb_encrypt:
testl $15,%edi
jz .L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L010ecb_out_aligned:
@@ -240,43 +256,75 @@ _padlock_ecb_encrypt:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
cmpl %ebp,%esp
- je .L011ecb_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
+ cmpl %ebp,%esp
+ je .L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
.L004ecb_abort:
@@ -299,19 +347,17 @@ _padlock_cbc_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
testl $15,%ecx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
leal .Lpadlock_saved_context,%eax
pushfl
cld
call __padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
testl $32,(%edx)
jnz .L018cbc_aligned
testl $15,%edi
@@ -331,7 +377,25 @@ _padlock_cbc_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -343,13 +407,13 @@ _padlock_cbc_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -359,61 +423,93 @@ _padlock_cbc_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
cmpl %ebp,%esp
- je .L022cbc_done
+ je .L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L024cbc_exit
-.align 16
-.L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L019cbc_loop
+ jmp .L026cbc_exit
.align 16
.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -433,25 +529,25 @@ _padlock_cfb_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L026cfb_abort
+ jnz .L028cfb_abort
testl $15,%ecx
- jnz .L026cfb_abort
+ jnz .L028cfb_abort
leal .Lpadlock_saved_context,%eax
pushfl
cld
call __padlock_verify_ctx
-.L027cfb_pic_point:
+.L029cfb_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
testl $32,(%edx)
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -463,10 +559,15 @@ _padlock_cfb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L029cfb_loop
+ movl %eax,16(%ebp)
+ jmp .L031cfb_loop
.align 16
-.L029cfb_loop:
+.L031cfb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -475,13 +576,13 @@ _padlock_cfb_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L030cfb_inp_aligned
+ jz .L032cfb_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L030cfb_inp_aligned:
+.L032cfb_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -491,61 +592,45 @@ _padlock_cfb_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L031cfb_out_aligned
+ jz .L033cfb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L031cfb_out_aligned:
+.L033cfb_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L029cfb_loop
+ jnz .L031cfb_loop
cmpl %ebp,%esp
- je .L032cfb_done
+ je .L034cfb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L033cfb_bzero:
+.L035cfb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L033cfb_bzero
-.L032cfb_done:
+ ja .L035cfb_bzero
+.L034cfb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L034cfb_exit
+ jmp .L036cfb_exit
.align 16
-.L035cfb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L036cfb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L036cfb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L029cfb_loop
-.align 16
-.L028cfb_aligned:
+.L030cfb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,224
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L034cfb_exit:
+.L036cfb_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L026cfb_abort:
+.L028cfb_abort:
popl %edi
popl %esi
popl %ebx
@@ -595,7 +680,12 @@ _padlock_ofb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
jmp .L040ofb_loop
.align 16
.L040ofb_loop:
@@ -625,8 +715,8 @@ _padlock_ofb_encrypt:
testl $15,%edi
jz .L042ofb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L042ofb_out_aligned:
@@ -647,26 +737,10 @@ _padlock_ofb_encrypt:
cmpl %eax,%ebp
ja .L044ofb_bzero
.L043ofb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
jmp .L045ofb_exit
.align 16
-.L046ofb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L047ofb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L047ofb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L040ofb_loop
-.align 16
.L039ofb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
@@ -697,14 +771,14 @@ _padlock_ctr32_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L048ctr32_abort
+ jnz .L046ctr32_abort
testl $15,%ecx
- jnz .L048ctr32_abort
+ jnz .L046ctr32_abort
leal .Lpadlock_saved_context,%eax
pushfl
cld
call __padlock_verify_ctx
-.L049ctr32_pic_point:
+.L047ctr32_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
movq -16(%edx),%mm0
@@ -718,10 +792,15 @@ _padlock_ctr32_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L050ctr32_loop
+ movl %eax,16(%ebp)
+ jmp .L048ctr32_loop
.align 16
-.L050ctr32_loop:
+.L048ctr32_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -730,7 +809,7 @@ _padlock_ctr32_encrypt:
movl -4(%edx),%ecx
xorl %edi,%edi
movl -8(%edx),%eax
-.L051ctr32_prepare:
+.L049ctr32_prepare:
movl %ecx,12(%esp,%edi,1)
bswap %ecx
movq %mm0,(%esp,%edi,1)
@@ -739,7 +818,7 @@ _padlock_ctr32_encrypt:
bswap %ecx
leal 16(%edi),%edi
cmpl %ebx,%edi
- jb .L051ctr32_prepare
+ jb .L049ctr32_prepare
movl %ecx,-4(%edx)
leal (%esp),%esi
leal (%esp),%edi
@@ -752,32 +831,33 @@ _padlock_ctr32_encrypt:
movl 12(%ebp),%ebx
movl 4(%ebp),%esi
xorl %ecx,%ecx
-.L052ctr32_xor:
+.L050ctr32_xor:
movups (%esi,%ecx,1),%xmm1
leal 16(%ecx),%ecx
pxor -16(%esp,%ecx,1),%xmm1
movups %xmm1,-16(%edi,%ecx,1)
cmpl %ebx,%ecx
- jb .L052ctr32_xor
+ jb .L050ctr32_xor
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L050ctr32_loop
+ jnz .L048ctr32_loop
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L053ctr32_bzero:
+.L051ctr32_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L053ctr32_bzero
-.L054ctr32_done:
+ ja .L051ctr32_bzero
+.L052ctr32_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
movl $1,%eax
leal 4(%esp),%esp
emms
-.L048ctr32_abort:
+.L046ctr32_abort:
popl %edi
popl %esi
popl %ebx
@@ -801,10 +881,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L055ret
+ jne .L053ret
addl $4,184(%ecx)
movl $0,%eax
-.L055ret:
+.L053ret:
ret
.globl _padlock_sha1_oneshot
.def _padlock_sha1_oneshot; .scl 2; .type 32; .endef
diff --git a/lib/accelerated/x86/coff/e_padlock-x86_64.s b/lib/accelerated/x86/coff/e_padlock-x86_64.s
index b9b93cc592..cfa91d3fff 100644
--- a/lib/accelerated/x86/coff/e_padlock-x86_64.s
+++ b/lib/accelerated/x86/coff/e_padlock-x86_64.s
@@ -135,7 +135,7 @@ padlock_aes_block:
movq $1,%rcx
leaq 32(%rdx),%rbx
leaq 16(%rdx),%rdx
-.byte 0xf3,0x0f,0xa7,0xc8
+.byte 0xf3,0x0f,0xa7,0xc8
movq %r8,%rbx
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -154,7 +154,7 @@ padlock_xstore:
movq %rdx,%rsi
movl %esi,%edx
-.byte 0x0f,0xa7,0xc0
+.byte 0x0f,0xa7,0xc0
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
@@ -181,7 +181,7 @@ padlock_sha1_oneshot:
movq %rsp,%rdi
movl %eax,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -213,7 +213,7 @@ padlock_sha1_blocks:
movq %rsp,%rdi
movl %eax,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -245,7 +245,7 @@ padlock_sha256_oneshot:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -277,7 +277,7 @@ padlock_sha256_blocks:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -312,7 +312,7 @@ padlock_sha512_blocks:
movaps %xmm1,16(%rsp)
movaps %xmm2,32(%rsp)
movaps %xmm3,48(%rsp)
-.byte 0xf3,0x0f,0xa6,0xe0
+.byte 0xf3,0x0f,0xa6,0xe0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
movaps 32(%rsp),%xmm2
@@ -354,8 +354,6 @@ padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
testl $32,(%rdx)
jnz .Lecb_aligned
testq $15,%rdi
@@ -375,6 +373,21 @@ padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
jmp .Lecb_loop
.p2align 4
.Lecb_loop:
@@ -390,7 +403,7 @@ padlock_ecb_encrypt:
testq $15,%rsi
jz .Lecb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -398,15 +411,15 @@ padlock_ecb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz .Lecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lecb_out_aligned:
movq %r9,%rsi
@@ -415,9 +428,26 @@ padlock_ecb_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.p2align 4
+.Lecb_break:
+ cmpq %rbp,%rsp
je .Lecb_done
pxor %xmm0,%xmm0
@@ -431,26 +461,39 @@ padlock_ecb_encrypt:
.Lecb_done:
leaq (%rbp),%rsp
jmp .Lecb_exit
-.p2align 4
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.p2align 4
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -489,8 +532,6 @@ padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
testl $32,(%rdx)
jnz .Lcbc_aligned
testq $15,%rdi
@@ -510,6 +551,21 @@ padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
jmp .Lcbc_loop
.p2align 4
.Lcbc_loop:
@@ -525,7 +581,7 @@ padlock_cbc_encrypt:
testq $15,%rsi
jz .Lcbc_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -533,7 +589,7 @@ padlock_cbc_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -541,9 +597,9 @@ padlock_cbc_encrypt:
testq $15,%rdi
jz .Lcbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcbc_out_aligned:
movq %r9,%rsi
@@ -552,9 +608,26 @@ padlock_cbc_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.p2align 4
+.Lcbc_break:
+ cmpq %rbp,%rsp
je .Lcbc_done
pxor %xmm0,%xmm0
@@ -568,28 +641,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
leaq (%rbp),%rsp
jmp .Lcbc_exit
-.p2align 4
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.p2align 4
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -647,6 +733,8 @@ padlock_cfb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp .Lcfb_loop
.p2align 4
.Lcfb_loop:
@@ -662,7 +750,7 @@ padlock_cfb_encrypt:
testq $15,%rsi
jz .Lcfb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -670,7 +758,7 @@ padlock_cfb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -678,9 +766,9 @@ padlock_cfb_encrypt:
testq $15,%rdi
jz .Lcfb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcfb_out_aligned:
movq %r9,%rsi
@@ -690,8 +778,7 @@ padlock_cfb_encrypt:
subq %rbx,%rcx
movq $512,%rbx
jnz .Lcfb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je .Lcfb_done
pxor %xmm0,%xmm0
@@ -705,12 +792,13 @@ padlock_cfb_encrypt:
.Lcfb_done:
leaq (%rbp),%rsp
jmp .Lcfb_exit
+
.p2align 4
.Lcfb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
.Lcfb_exit:
@@ -770,6 +858,8 @@ padlock_ofb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp .Lofb_loop
.p2align 4
.Lofb_loop:
@@ -785,7 +875,7 @@ padlock_ofb_encrypt:
testq $15,%rsi
jz .Lofb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -793,7 +883,7 @@ padlock_ofb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -801,9 +891,9 @@ padlock_ofb_encrypt:
testq $15,%rdi
jz .Lofb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lofb_out_aligned:
movq %r9,%rsi
@@ -813,8 +903,7 @@ padlock_ofb_encrypt:
subq %rbx,%rcx
movq $512,%rbx
jnz .Lofb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je .Lofb_done
pxor %xmm0,%xmm0
@@ -828,12 +917,13 @@ padlock_ofb_encrypt:
.Lofb_done:
leaq (%rbp),%rsp
jmp .Lofb_exit
+
.p2align 4
.Lofb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
.Lofb_exit:
@@ -874,8 +964,6 @@ padlock_ctr32_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lctr32_short
testl $32,(%rdx)
jnz .Lctr32_aligned
testq $15,%rdi
@@ -895,15 +983,32 @@ padlock_ctr32_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
.Lctr32_reenter:
movl -4(%rdx),%eax
bswapl %eax
negl %eax
andl $31,%eax
- jz .Lctr32_loop
+ movq $512,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja .Lctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lctr32_unaligned_tail
jmp .Lctr32_loop
.p2align 4
.Lctr32_loop:
@@ -919,7 +1024,7 @@ padlock_ctr32_encrypt:
testq $15,%rsi
jz .Lctr32_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -927,23 +1032,23 @@ padlock_ctr32_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
movl -4(%rdx),%eax
testl $4294901760,%eax
- jnz .Lctr32_no_corr
+ jnz .Lctr32_no_carry
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
-.Lctr32_no_corr:
+.Lctr32_no_carry:
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz .Lctr32_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lctr32_out_aligned:
movq %r9,%rsi
@@ -952,9 +1057,38 @@ padlock_ctr32_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
+ jz .Lctr32_break
+ cmpq %rbx,%rcx
+ jae .Lctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
jnz .Lctr32_loop
-
+.Lctr32_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.p2align 4
+.Lctr32_break:
+ cmpq %rbp,%rsp
je .Lctr32_done
pxor %xmm0,%xmm0
@@ -968,56 +1102,75 @@ padlock_ctr32_encrypt:
.Lctr32_done:
leaq (%rbp),%rsp
jmp .Lctr32_exit
-.p2align 4
-.Lctr32_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lctr32_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lctr32_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lctr32_reenter
+
.p2align 4
.Lctr32_aligned:
movl -4(%rdx),%eax
- movq $1048576,%rbx
bswapl %eax
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
negl %eax
andl $65535,%eax
- jz .Lctr32_aligned_loop
+ movq $1048576,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
- jmp .Lctr32_aligned_loop
-.p2align 4
+ cmovbeq %rcx,%rbx
+ jbe .Lctr32_aligned_skip
+
.Lctr32_aligned_loop:
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
movq %rcx,%r10
movq %rbx,%rcx
movq %rbx,%r11
+
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
+
movl -4(%rdx),%eax
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
- movq %r11,%rbx
movq %r10,%rcx
- subq %rbx,%rcx
+ subq %r11,%rcx
movq $1048576,%rbx
- jnz .Lctr32_aligned_loop
+ jz .Lctr32_exit
+ cmpq %rbx,%rcx
+ jae .Lctr32_aligned_loop
+
+.Lctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz .Lctr32_exit
+
+.Lctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
.Lctr32_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s
index 7a89b7a599..b1d69911c7 100644
--- a/lib/accelerated/x86/coff/ghash-x86_64.s
+++ b/lib/accelerated/x86/coff/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl gcm_gmult_4bit
.def gcm_gmult_4bit; .scl 2; .type 32; .endef
.p2align 4
@@ -717,6 +718,11 @@ gcm_ghash_4bit:
.def gcm_init_clmul; .scl 2; .type 32; .endef
.p2align 4
gcm_init_clmul:
+.L_init_clmul:
+.LSEH_begin_gcm_init_clmul:
+
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
movdqu (%rdx),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -735,15 +741,15 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -753,44 +759,137 @@ gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rcx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rcx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rcx)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rcx)
- movdqu %xmm0,16(%rcx)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rcx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rcx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rcx)
+ movaps (%rsp),%xmm6
+ leaq 24(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
.byte 0xf3,0xc3
.globl gcm_gmult_clmul
.def gcm_gmult_clmul; .scl 2; .type 32; .endef
.p2align 4
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu (%rcx),%xmm0
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rdx),%xmm2
+ movdqu 32(%rdx),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -803,209 +902,395 @@ gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rcx)
.byte 0xf3,0xc3
.globl gcm_ghash_clmul
.def gcm_ghash_clmul; .scl 2; .type 32; .endef
-.p2align 4
+.p2align 5
gcm_ghash_clmul:
+.L_ghash_clmul:
+ leaq -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
-.byte 0x48,0x83,0xec,0x58
-.byte 0x0f,0x29,0x34,0x24
-.byte 0x0f,0x29,0x7c,0x24,0x10
-.byte 0x44,0x0f,0x29,0x44,0x24,0x20
-.byte 0x44,0x0f,0x29,0x4c,0x24,0x30
-.byte 0x44,0x0f,0x29,0x54,0x24,0x40
- movdqa .Lbswap_mask(%rip),%xmm5
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rcx),%xmm0
movdqu (%rdx),%xmm2
-.byte 102,15,56,0,197
+ movdqu 32(%rdx),%xmm7
+.byte 102,65,15,56,0,194
subq $16,%r9
jz .Lodd_tail
- movdqu 16(%rdx),%xmm8
-
+ movdqu 16(%rdx),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $48,%r9
+ jb .Lskip4x
+
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $48,%r9
+ movq $11547335547999543296,%rax
+ movdqu 48(%rdx),%xmm14
+ movdqu 64(%rdx),%xmm15
+
+
+
+
+ movdqu 48(%r8),%xmm3
+ movdqu 32(%r8),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%r8),%xmm11
+ movdqu 0(%r8),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%r8),%r8
+ subq $64,%r9
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.p2align 5
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%r8),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%r8),%xmm3
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rdx),%xmm7
+.byte 102,68,15,58,68,218,0
+ xorps %xmm4,%xmm8
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ pxor %xmm3,%xmm4
+ pxor %xmm1,%xmm8
+ movdqa %xmm8,%xmm9
+ pslldq $8,%xmm8
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+.byte 102,68,15,58,68,231,0
+ pxor %xmm0,%xmm9
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%r8),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%r8),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+ leaq 64(%r8),%r8
+ subq $64,%r9
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
- movdqu (%r8),%xmm3
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
-
- movdqa %xmm3,%xmm4
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%r9
+ jz .Ldone
+ movdqu 32(%rdx),%xmm7
+ subq $16,%r9
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
+
+ movdqu (%r8),%xmm8
+ movdqu 16(%r8),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%r8),%r8
+ nop
subq $32,%r9
jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+.p2align 5
.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%r8),%xmm5
+ pxor %xmm0,%xmm8
+.byte 102,65,15,56,0,234
+ movdqu 16(%r8),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm5,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqu (%r8),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
-
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
- pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-.byte 102,15,58,68,250,17
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+.byte 102,15,58,68,234,17
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%r8),%r8
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+.byte 0x66,0x90
- leaq 32(%r8),%r8
subq $32,%r9
ja .Lmod_loop
.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %r9,%r9
jnz .Ldone
.Lodd_tail:
- movdqu (%r8),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1015,44 +1300,72 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rcx)
movaps (%rsp),%xmm6
movaps 16(%rsp),%xmm7
movaps 32(%rsp),%xmm8
movaps 48(%rsp),%xmm9
movaps 64(%rsp),%xmm10
- addq $88,%rsp
- .byte 0xf3,0xc3
+ movaps 80(%rsp),%xmm11
+ movaps 96(%rsp),%xmm12
+ movaps 112(%rsp),%xmm13
+ movaps 128(%rsp),%xmm14
+ movaps 144(%rsp),%xmm15
+ leaq 168(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
+ .byte 0xf3,0xc3
+
+.globl gcm_init_avx
+.def gcm_init_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_init_avx:
+ jmp .L_init_clmul
+
+.globl gcm_gmult_avx
+.def gcm_gmult_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+
+.globl gcm_ghash_avx
+.def gcm_ghash_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
.p2align 6
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.p2align 6
.Lrem_4bit:
@@ -1149,7 +1462,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1189,26 +1502,38 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+.rva .LSEH_begin_gcm_init_clmul
+.rva .LSEH_end_gcm_init_clmul
+.rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
-
.section .xdata
.p2align 3
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lgmult_prologue,.Lgmult_epilogue
+.rva .Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lghash_prologue,.Lghash_epilogue
+.rva .Lghash_prologue,.Lghash_epilogue
+.LSEH_info_gcm_init_clmul:
+.byte 0x01,0x08,0x03,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
-.byte 0x01,0x1f,0x0b,0x00
-.byte 0x1f,0xa8,0x04,0x00
-.byte 0x19,0x98,0x03,0x00
-.byte 0x13,0x88,0x02,0x00
-.byte 0x0d,0x78,0x01,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0xa2,0x00,0x00
+.byte 0x01,0x33,0x16,0x00
+.byte 0x33,0xf8,0x09,0x00
+.byte 0x2e,0xe8,0x08,0x00
+.byte 0x29,0xd8,0x07,0x00
+.byte 0x24,0xc8,0x06,0x00
+.byte 0x1f,0xb8,0x05,0x00
+.byte 0x1a,0xa8,0x04,0x00
+.byte 0x15,0x98,0x03,0x00
+.byte 0x10,0x88,0x02,0x00
+.byte 0x0c,0x78,0x01,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x01,0x15,0x00
diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
index 9aa029f1bb..e8136025e8 100644
--- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
@@ -54,23 +54,25 @@ sha1_block_data_order:
movl _gnutls_x86_cpuid_s+0(%rip),%r9d
movl _gnutls_x86_cpuid_s+4(%rip),%r8d
+ movl _gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz .Lialu
jmp _ssse3_shortcut
.p2align 4
.Lialu:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
- movq %rsp,%r11
+ pushq %r14
movq %rdi,%r8
subq $72,%rsp
movq %rsi,%r9
andq $-64,%rsp
movq %rdx,%r10
- movq %r11,64(%rsp)
+ movq %rax,64(%rsp)
.Lprologue:
movl 0(%r8),%esi
@@ -84,1230 +86,1168 @@ sha1_block_data_order:
.Lloop:
movl 0(%r9),%edx
bswapl %edx
- movl %edx,0(%rsp)
- movl %r11d,%eax
movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %ebp
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,4(%rsp)
+ leal 1518500249(%rdx,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
- movl 8(%r9),%edx
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,8(%rsp)
+ leal 1518500249(%rbp,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 12(%r9),%ebp
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,12(%rsp)
+ leal 1518500249(%r14,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 16(%r9),%edx
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,16(%rsp)
+ leal 1518500249(%rdx,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 20(%r9),%ebp
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,20(%rsp)
+ leal 1518500249(%rbp,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %edx
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r13,1),%r13d
andl %edi,%eax
- movl %edx,24(%rsp)
+ leal 1518500249(%r14,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %ebp
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r12,1),%r12d
andl %esi,%eax
- movl %ebp,28(%rsp)
+ leal 1518500249(%rdx,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 32(%r9),%edx
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r11,1),%r11d
andl %r13d,%eax
- movl %edx,32(%rsp)
+ leal 1518500249(%rbp,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 36(%r9),%ebp
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rdi,1),%edi
andl %r12d,%eax
- movl %ebp,36(%rsp)
+ leal 1518500249(%r14,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 40(%r9),%edx
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rsi,1),%esi
andl %r11d,%eax
- movl %edx,40(%rsp)
+ leal 1518500249(%rdx,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
- movl 44(%r9),%ebp
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,44(%rsp)
+ leal 1518500249(%rbp,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %edx
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,48(%rsp)
+ leal 1518500249(%r14,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
bswapl %ebp
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,52(%rsp)
+ leal 1518500249(%rdx,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 56(%r9),%edx
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,56(%rsp)
+ leal 1518500249(%rbp,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 60(%r9),%ebp
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,60(%rsp)
+ leal 1518500249(%r14,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl 0(%rsp),%edx
- movl %r11d,%eax
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %esi,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
roll $5,%ecx
- xorl 32(%rsp),%edx
+ xorl 32(%rsp),%ebp
andl %edi,%eax
- leal 1518500249(%rbp,%r13,1),%r13d
- xorl 52(%rsp),%edx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
xorl %r12d,%eax
- roll $1,%edx
addl %ecx,%r13d
- roll $30,%edi
- movl %edx,0(%rsp)
+ roll $1,%ebp
addl %eax,%r13d
- movl 4(%rsp),%ebp
- movl %edi,%eax
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %r13d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
- xorl 36(%rsp),%ebp
+ xorl 36(%rsp),%r14d
andl %esi,%eax
- leal 1518500249(%rdx,%r12,1),%r12d
- xorl 56(%rsp),%ebp
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
xorl %r11d,%eax
- roll $1,%ebp
addl %ecx,%r12d
- roll $30,%esi
- movl %ebp,4(%rsp)
+ roll $1,%r14d
addl %eax,%r12d
- movl 8(%rsp),%edx
- movl %esi,%eax
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %r12d,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
roll $5,%ecx
xorl 40(%rsp),%edx
andl %r13d,%eax
- leal 1518500249(%rbp,%r11,1),%r11d
- xorl 60(%rsp),%edx
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
xorl %edi,%eax
- roll $1,%edx
addl %ecx,%r11d
- roll $30,%r13d
- movl %edx,8(%rsp)
+ roll $1,%edx
addl %eax,%r11d
- movl 12(%rsp),%ebp
- movl %r13d,%eax
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
movl %r11d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r13d,%eax
roll $5,%ecx
xorl 44(%rsp),%ebp
andl %r12d,%eax
leal 1518500249(%rdx,%rdi,1),%edi
- xorl 0(%rsp),%ebp
+ roll $30,%r12d
xorl %esi,%eax
- roll $1,%ebp
addl %ecx,%edi
- roll $30,%r12d
- movl %ebp,12(%rsp)
+ roll $1,%ebp
addl %eax,%edi
- movl 16(%rsp),%edx
- movl %r12d,%eax
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
movl %edi,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
- xorl 48(%rsp),%edx
+ xorl 48(%rsp),%r14d
andl %r11d,%eax
leal 1518500249(%rbp,%rsi,1),%esi
- xorl 4(%rsp),%edx
+ roll $30,%r11d
xorl %r13d,%eax
- roll $1,%edx
addl %ecx,%esi
- roll $30,%r11d
- movl %edx,16(%rsp)
+ roll $1,%r14d
addl %eax,%esi
- movl 20(%rsp),%ebp
- movl %r11d,%eax
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
movl %esi,%ecx
- xorl 28(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 8(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %edi,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
movl %r13d,%ecx
- xorl 32(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r12,1),%r12d
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 12(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
movl %r12d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 16(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
movl %r11d,%ecx
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
xorl 0(%rsp),%edx
- xorl %esi,%eax
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 20(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r12d,%eax
movl %edi,%ecx
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
xorl 4(%rsp),%ebp
- xorl %r13d,%eax
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 24(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r11d,%eax
movl %esi,%ecx
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal 1859775393(%rbp,%r13,1),%r13d
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 28(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
movl %r13d,%ecx
- xorl 52(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 32(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %esi,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
movl %r12d,%ecx
- xorl 56(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 36(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
movl %r11d,%ecx
- xorl 60(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 40(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
movl %edi,%ecx
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 44(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
xorl 28(%rsp),%ebp
- xorl %r12d,%eax
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 48(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 32(%rsp),%r14d
leal 1859775393(%rbp,%r12,1),%r12d
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 52(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
movl %r12d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 56(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
- xorl 40(%rsp),%edx
+ xorl 16(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 60(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl 44(%rsp),%ebp
+ xorl 20(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 0(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
movl %esi,%ecx
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%r13,1),%r13d
xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 4(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
xorl 52(%rsp),%ebp
- xorl %r11d,%eax
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 8(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 32(%rsp),%edx
- xorl %r13d,%eax
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 56(%rsp),%r14d
leal 1859775393(%rbp,%r11,1),%r11d
- xorl 56(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 12(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
movl %r11d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 16(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
- xorl 0(%rsp),%edx
+ xorl 40(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 20(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 44(%rsp),%ebp
- andl %r12d,%eax
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 4(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,36(%rsp)
addl %ecx,%r13d
- movl 40(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
xorl 48(%rsp),%edx
- andl %r11d,%eax
+ andl %edi,%eax
movl %r13d,%ecx
xorl 8(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r12d
- andl %esi,%ebx
roll $1,%edx
- addl %ebx,%r12d
+ andl %esi,%ebx
+ addl %ecx,%r12d
roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
movl %edx,40(%rsp)
- addl %ecx,%r12d
- movl 44(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
+ movl %edi,%ebx
xorl 52(%rsp),%ebp
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 12(%rsp),%ebp
- xorl %edi,%ebx
leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%ebp
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,44(%rsp)
- addl %ecx,%r11d
- movl 48(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 56(%rsp),%edx
- andl %esi,%eax
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %esi,%ebx
+ xorl 16(%rsp),%r14d
leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%edx
- addl %ebx,%edi
- roll $30,%r12d
- movl %edx,48(%rsp)
addl %ecx,%edi
- movl 52(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 60(%rsp),%ebp
- andl %r13d,%eax
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r13d,%ebx
- leal -1894007588(%rdx,%rsi,1),%esi
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 40(%rsp),%ebp
addl %eax,%esi
+ roll $1,%edx
andl %r11d,%ebx
- roll $1,%ebp
- addl %ebx,%esi
- roll $30,%r11d
- movl %ebp,52(%rsp)
addl %ecx,%esi
- movl 56(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 0(%rsp),%edx
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 24(%rsp),%edx
- xorl %r12d,%ebx
- leal -1894007588(%rbp,%r13,1),%r13d
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 44(%rsp),%edx
addl %eax,%r13d
+ roll $1,%ebp
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,56(%rsp)
addl %ecx,%r13d
- movl 60(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 4(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 48(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%r14d
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,60(%rsp)
addl %ecx,%r12d
- movl 0(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
xorl 8(%rsp),%edx
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 32(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 52(%rsp),%edx
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%edx
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
movl %edx,0(%rsp)
- addl %ecx,%r11d
- movl 4(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
+ movl %esi,%ebx
xorl 12(%rsp),%ebp
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 36(%rsp),%ebp
- xorl %esi,%ebx
leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 56(%rsp),%ebp
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%ebp
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,4(%rsp)
- addl %ecx,%edi
- movl 8(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 16(%rsp),%edx
- andl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r13d,%ebx
+ xorl 40(%rsp),%r14d
leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 60(%rsp),%edx
addl %eax,%esi
+ roll $1,%r14d
andl %r11d,%ebx
- roll $1,%edx
- addl %ebx,%esi
- roll $30,%r11d
- movl %edx,8(%rsp)
addl %ecx,%esi
- movl 12(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 20(%rsp),%ebp
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 44(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 0(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%edx
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,12(%rsp)
addl %ecx,%r13d
- movl 16(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
- xorl 24(%rsp),%edx
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 48(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 4(%rsp),%edx
addl %eax,%r12d
+ roll $1,%ebp
andl %esi,%ebx
- roll $1,%edx
- addl %ebx,%r12d
- roll $30,%esi
- movl %edx,16(%rsp)
addl %ecx,%r12d
- movl 20(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
- xorl 28(%rsp),%ebp
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 52(%rsp),%ebp
- xorl %edi,%ebx
- leal -1894007588(%rdx,%r11,1),%r11d
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 8(%rsp),%ebp
addl %eax,%r11d
+ roll $1,%r14d
andl %r13d,%ebx
- roll $1,%ebp
- addl %ebx,%r11d
- roll $30,%r13d
- movl %ebp,20(%rsp)
addl %ecx,%r11d
- movl 24(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
xorl 32(%rsp),%edx
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 56(%rsp),%edx
- xorl %esi,%ebx
- leal -1894007588(%rbp,%rdi,1),%edi
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 12(%rsp),%edx
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%edx
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
movl %edx,24(%rsp)
- addl %ecx,%edi
- movl 28(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
+ movl %r13d,%ebx
xorl 36(%rsp),%ebp
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 60(%rsp),%ebp
- xorl %r13d,%ebx
leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 16(%rsp),%ebp
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%ebp
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
movl %ebp,28(%rsp)
- addl %ecx,%esi
- movl 32(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 40(%rsp),%edx
- andl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 0(%rsp),%edx
- xorl %r12d,%ebx
+ xorl 0(%rsp),%r14d
leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 20(%rsp),%edx
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,32(%rsp)
addl %ecx,%r13d
- movl 36(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 44(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%edx
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,36(%rsp)
addl %ecx,%r12d
- movl 40(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
- xorl 48(%rsp),%edx
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 8(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r11d
+ roll $1,%ebp
andl %r13d,%ebx
- roll $1,%edx
- addl %ebx,%r11d
- roll $30,%r13d
- movl %edx,40(%rsp)
addl %ecx,%r11d
- movl 44(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 52(%rsp),%ebp
- andl %esi,%eax
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 12(%rsp),%ebp
- xorl %esi,%ebx
- leal -1894007588(%rdx,%rdi,1),%edi
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%ebp
- addl %ebx,%edi
- roll $30,%r12d
- movl %ebp,44(%rsp)
addl %ecx,%edi
- movl 48(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
xorl 56(%rsp),%edx
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 16(%rsp),%edx
- xorl %r13d,%ebx
- leal -1894007588(%rbp,%rsi,1),%esi
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%edx
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
movl %edx,48(%rsp)
- addl %ecx,%esi
- movl 52(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 60(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
xorl 20(%rsp),%ebp
- xorl %r12d,%eax
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 40(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 0(%rsp),%edx
- xorl %esi,%eax
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 24(%rsp),%r14d
leal -899497514(%rbp,%r12,1),%r12d
- xorl 24(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 44(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %r12d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
- xorl 28(%rsp),%ebp
+ xorl 4(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 48(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %r11d,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rdi,1),%edi
- xorl 32(%rsp),%edx
+ xorl 8(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 52(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %edi,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 56(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %esi,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 60(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 0(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 48(%rsp),%r14d
leal -899497514(%rbp,%r11,1),%r11d
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 4(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
movl %r11d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 8(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
movl %edi,%ecx
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rsi,1),%esi
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 12(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r11d,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
movl %esi,%ecx
- xorl 36(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 16(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
movl %r13d,%ecx
xorl 40(%rsp),%edx
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r12,1),%r12d
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 20(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %esi,%eax
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 44(%rsp),%ebp
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 24(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
roll $1,%ebp
- movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r13d,%eax
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
movl %r11d,%ecx
- xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal -899497514(%rbp,%rdi,1),%edi
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 28(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
movl %edi,%ecx
- xorl 52(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 32(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
movl %esi,%ecx
- xorl 56(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 36(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
movl %r13d,%ecx
- xorl 60(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 40(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl 56(%rsp),%edx
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 0(%rsp),%edx
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r11,1),%r11d
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 44(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
roll $1,%edx
- movl 60(%rsp),%ebp
- movl %r13d,%eax
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
movl %r11d,%ecx
xorl 4(%rsp),%ebp
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 48(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
roll $1,%ebp
- movl %r12d,%eax
+ movl %r11d,%eax
movl %edi,%ecx
- xorl %r11d,%eax
+ xorl %r13d,%eax
leal -899497514(%rbp,%rsi,1),%esi
roll $5,%ecx
- xorl %r13d,%eax
+ xorl %r12d,%eax
addl %ecx,%esi
roll $30,%r11d
addl %eax,%esi
@@ -1327,11 +1267,12 @@ sha1_block_data_order:
jnz .Lloop
movq 64(%rsp),%rsi
- movq (%rsi),%r13
- movq 8(%rsi),%r12
- movq 16(%rsi),%rbp
- movq 24(%rsi),%rbx
- leaq 32(%rsi),%rsp
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -1349,23 +1290,29 @@ sha1_block_data_order_ssse3:
movq %r8,%rdx
_ssse3_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
- leaq -144(%rsp),%rsp
- movaps %xmm6,64+0(%rsp)
- movaps %xmm7,64+16(%rsp)
- movaps %xmm8,64+32(%rsp)
- movaps %xmm9,64+48(%rsp)
- movaps %xmm10,64+64(%rsp)
+ pushq %r13
+ pushq %r14
+ leaq -160(%rsp),%rsp
+ movaps %xmm6,-40-96(%rax)
+ movaps %xmm7,-40-80(%rax)
+ movaps %xmm8,-40-64(%rax)
+ movaps %xmm9,-40-48(%rax)
+ movaps %xmm10,-40-32(%rax)
+ movaps %xmm11,-40-16(%rax)
.Lprologue_ssse3:
+ movq %rax,%r14
+ andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r11
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1373,19 +1320,22 @@ _ssse3_shortcut:
movl 12(%r8),%edx
movl %ebx,%esi
movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
.byte 102,15,56,0,198
- addq $64,%r9
.byte 102,15,56,0,206
.byte 102,15,56,0,214
-.byte 102,15,56,0,222
+ addq $64,%r9
paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
paddd %xmm9,%xmm1
paddd %xmm9,%xmm2
movdqa %xmm0,0(%rsp)
@@ -1397,904 +1347,882 @@ _ssse3_shortcut:
jmp .Loop_ssse3
.p2align 4
.Loop_ssse3:
- movdqa %xmm1,%xmm4
- addl 0(%rsp),%ebp
- xorl %edx,%ecx
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
movdqa %xmm3,%xmm8
-.byte 102,15,58,15,224,8
+ paddd %xmm3,%xmm9
movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
roll $5,%eax
- paddd %xmm3,%xmm9
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl %esi,%ebp
psrldq $4,%xmm8
- xorl %edx,%esi
- addl %eax,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
pxor %xmm0,%xmm4
- rorl $2,%ebx
- addl %esi,%ebp
+ addl %eax,%ebp
+ rorl $7,%eax
pxor %xmm2,%xmm8
- addl 4(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 4(%rsp),%edx
pxor %xmm8,%xmm4
- andl %ebx,%edi
- xorl %ecx,%ebx
+ xorl %ebx,%eax
+ roll $5,%ebp
movdqa %xmm9,48(%rsp)
- xorl %ecx,%edi
- addl %ebp,%edx
- movdqa %xmm4,%xmm10
- movdqa %xmm4,%xmm8
- rorl $7,%eax
addl %edi,%edx
- addl 8(%rsp),%ecx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
pslldq $12,%xmm10
paddd %xmm4,%xmm4
movl %edx,%edi
- roll $5,%edx
- andl %eax,%esi
- xorl %ebx,%eax
+ addl 8(%rsp),%ecx
psrld $31,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
- movdqa %xmm10,%xmm9
- rorl $7,%ebp
+ xorl %eax,%ebp
+ roll $5,%edx
addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
por %xmm8,%xmm4
- addl 12(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 12(%rsp),%ebx
pslld $2,%xmm9
pxor %xmm10,%xmm4
- andl %ebp,%edi
- xorl %eax,%ebp
- movdqa 0(%r11),%xmm10
- xorl %eax,%edi
- addl %ecx,%ebx
- pxor %xmm9,%xmm4
- rorl $7,%edx
+ xorl %ebp,%edx
+ movdqa -64(%r11),%xmm10
+ roll $5,%ecx
addl %edi,%ebx
- movdqa %xmm2,%xmm5
- addl 16(%rsp),%eax
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
movdqa %xmm4,%xmm9
-.byte 102,15,58,15,233,8
+ paddd %xmm4,%xmm10
movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
roll $5,%ebx
- paddd %xmm4,%xmm10
- andl %edx,%esi
- xorl %ebp,%edx
+ addl %esi,%eax
psrldq $4,%xmm9
- xorl %ebp,%esi
- addl %ebx,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
pxor %xmm1,%xmm5
- rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
+ rorl $7,%ebx
pxor %xmm3,%xmm9
- addl 20(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 20(%rsp),%ebp
pxor %xmm9,%xmm5
- andl %ecx,%edi
- xorl %edx,%ecx
+ xorl %ecx,%ebx
+ roll $5,%eax
movdqa %xmm10,0(%rsp)
- xorl %edx,%edi
- addl %eax,%ebp
- movdqa %xmm5,%xmm8
- movdqa %xmm5,%xmm9
- rorl $7,%ebx
addl %edi,%ebp
- addl 24(%rsp),%edx
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
pslldq $12,%xmm8
paddd %xmm5,%xmm5
movl %ebp,%edi
- roll $5,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl 24(%rsp),%edx
psrld $31,%xmm9
- xorl %ecx,%esi
- addl %ebp,%edx
- movdqa %xmm8,%xmm10
- rorl $7,%eax
+ xorl %ebx,%eax
+ roll $5,%ebp
addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
por %xmm9,%xmm5
- addl 28(%rsp),%ecx
- xorl %ebx,%eax
+ xorl %ebx,%edi
movl %edx,%esi
- roll $5,%edx
+ addl 28(%rsp),%ecx
pslld $2,%xmm10
pxor %xmm8,%xmm5
- andl %eax,%edi
- xorl %ebx,%eax
- movdqa 16(%r11),%xmm8
- xorl %ebx,%edi
- addl %edx,%ecx
- pxor %xmm10,%xmm5
- rorl $7,%ebp
+ xorl %eax,%ebp
+ movdqa -32(%r11),%xmm8
+ roll $5,%edx
addl %edi,%ecx
- movdqa %xmm3,%xmm6
- addl 32(%rsp),%ebx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
movdqa %xmm5,%xmm10
-.byte 102,15,58,15,242,8
+ paddd %xmm5,%xmm8
movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
roll $5,%ecx
- paddd %xmm5,%xmm8
- andl %ebp,%esi
- xorl %eax,%ebp
+ addl %esi,%ebx
psrldq $4,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
pxor %xmm2,%xmm6
- rorl $7,%edx
- addl %esi,%ebx
+ addl %ecx,%ebx
+ rorl $7,%ecx
pxor %xmm4,%xmm10
- addl 36(%rsp),%eax
- xorl %ebp,%edx
+ xorl %ebp,%edi
movl %ebx,%esi
- roll $5,%ebx
+ addl 36(%rsp),%eax
pxor %xmm10,%xmm6
- andl %edx,%edi
- xorl %ebp,%edx
+ xorl %edx,%ecx
+ roll $5,%ebx
movdqa %xmm8,16(%rsp)
- xorl %ebp,%edi
- addl %ebx,%eax
- movdqa %xmm6,%xmm9
- movdqa %xmm6,%xmm10
- rorl $7,%ecx
addl %edi,%eax
- addl 40(%rsp),%ebp
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
pslldq $12,%xmm9
paddd %xmm6,%xmm6
movl %eax,%edi
- roll $5,%eax
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl 40(%rsp),%ebp
psrld $31,%xmm10
- xorl %edx,%esi
- addl %eax,%ebp
- movdqa %xmm9,%xmm8
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ roll $5,%eax
addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
por %xmm10,%xmm6
- addl 44(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 44(%rsp),%edx
pslld $2,%xmm8
pxor %xmm9,%xmm6
- andl %ebx,%edi
- xorl %ecx,%ebx
- movdqa 16(%r11),%xmm9
- xorl %ecx,%edi
- addl %ebp,%edx
- pxor %xmm8,%xmm6
- rorl $7,%eax
+ xorl %ebx,%eax
+ movdqa -32(%r11),%xmm9
+ roll $5,%ebp
addl %edi,%edx
- movdqa %xmm4,%xmm7
- addl 48(%rsp),%ecx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
movdqa %xmm6,%xmm8
-.byte 102,15,58,15,251,8
+ paddd %xmm6,%xmm9
movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
roll $5,%edx
- paddd %xmm6,%xmm9
- andl %eax,%esi
- xorl %ebx,%eax
+ addl %esi,%ecx
psrldq $4,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
pxor %xmm3,%xmm7
- rorl $7,%ebp
- addl %esi,%ecx
+ addl %edx,%ecx
+ rorl $7,%edx
pxor %xmm5,%xmm8
- addl 52(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 52(%rsp),%ebx
pxor %xmm8,%xmm7
- andl %ebp,%edi
- xorl %eax,%ebp
+ xorl %ebp,%edx
+ roll $5,%ecx
movdqa %xmm9,32(%rsp)
- xorl %eax,%edi
- addl %ecx,%ebx
- movdqa %xmm7,%xmm10
- movdqa %xmm7,%xmm8
- rorl $7,%edx
addl %edi,%ebx
- addl 56(%rsp),%eax
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
pslldq $12,%xmm10
paddd %xmm7,%xmm7
movl %ebx,%edi
- roll $5,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
+ addl 56(%rsp),%eax
psrld $31,%xmm8
- xorl %ebp,%esi
- addl %ebx,%eax
- movdqa %xmm10,%xmm9
- rorl $7,%ecx
+ xorl %edx,%ecx
+ roll $5,%ebx
addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
por %xmm8,%xmm7
- addl 60(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 60(%rsp),%ebp
pslld $2,%xmm9
pxor %xmm10,%xmm7
- andl %ecx,%edi
- xorl %edx,%ecx
- movdqa 16(%r11),%xmm10
- xorl %edx,%edi
- addl %eax,%ebp
- pxor %xmm9,%xmm7
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ movdqa -32(%r11),%xmm10
+ roll $5,%eax
addl %edi,%ebp
- movdqa %xmm7,%xmm9
- addl 0(%rsp),%edx
- pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,206,8
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
roll $5,%ebp
pxor %xmm1,%xmm0
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl %esi,%edx
+ andl %eax,%edi
movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
paddd %xmm7,%xmm10
- xorl %ecx,%esi
addl %ebp,%edx
pxor %xmm9,%xmm0
- rorl $7,%eax
- addl %esi,%edx
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
addl 4(%rsp),%ecx
- xorl %ebx,%eax
movdqa %xmm0,%xmm9
- movdqa %xmm10,48(%rsp)
- movl %edx,%esi
+ xorl %eax,%ebp
roll $5,%edx
- andl %eax,%edi
- xorl %ebx,%eax
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
pslld $2,%xmm0
- xorl %ebx,%edi
addl %edx,%ecx
+ rorl $7,%edx
psrld $30,%xmm9
- rorl $7,%ebp
- addl %edi,%ecx
- addl 8(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%esi
movl %ecx,%edi
- roll $5,%ecx
+ addl 8(%rsp),%ebx
por %xmm9,%xmm0
- andl %ebp,%esi
- xorl %eax,%ebp
- movdqa %xmm0,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- addl 12(%rsp),%eax
xorl %ebp,%edx
- movl %ebx,%esi
- roll $5,%ebx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
andl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
xorl %ebp,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ movl %ebx,%esi
+ roll $5,%ebx
addl %edi,%eax
- addl 16(%rsp),%ebp
- pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,215,8
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
movl %eax,%edi
roll $5,%eax
pxor %xmm2,%xmm1
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm8,%xmm9
- paddd %xmm0,%xmm8
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
pxor %xmm10,%xmm1
addl 20(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm8,0(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm1
+ addl %ebp,%edx
addl 24(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm1
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm10
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm10,%xmm1
+ addl %edx,%ecx
addl 28(%rsp),%ebx
- xorl %eax,%edi
- movdqa %xmm1,%xmm8
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 32(%rsp),%eax
- pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,192,8
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
movl %ebx,%edi
roll $5,%ebx
pxor %xmm3,%xmm2
- xorl %edx,%esi
- addl %ebx,%eax
- movdqa 32(%r11),%xmm10
- paddd %xmm1,%xmm9
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r11),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
pxor %xmm8,%xmm2
addl 36(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
movdqa %xmm9,16(%rsp)
- xorl %ecx,%edi
- addl %eax,%ebp
rorl $7,%ebx
- addl %edi,%ebp
- pslld $2,%xmm2
+ addl %eax,%ebp
addl 40(%rsp),%edx
- xorl %ecx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm2
+ xorl %ebx,%esi
movl %ebp,%edi
+ psrld $30,%xmm8
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
por %xmm8,%xmm2
+ addl %ebp,%edx
addl 44(%rsp),%ecx
- xorl %ebx,%edi
- movdqa %xmm2,%xmm9
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 48(%rsp),%ebx
- pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,201,8
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
movl %ecx,%edi
roll $5,%ecx
pxor %xmm4,%xmm3
- xorl %ebp,%esi
- addl %ecx,%ebx
+ addl %esi,%ebx
+ xorl %ebp,%edi
movdqa %xmm10,%xmm8
- paddd %xmm2,%xmm10
rorl $7,%edx
- addl %esi,%ebx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
pxor %xmm9,%xmm3
addl 52(%rsp),%eax
- xorl %ebp,%edi
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
movdqa %xmm10,32(%rsp)
- xorl %edx,%edi
- addl %ebx,%eax
rorl $7,%ecx
- addl %edi,%eax
- pslld $2,%xmm3
+ addl %ebx,%eax
addl 56(%rsp),%ebp
- xorl %edx,%esi
- psrld $30,%xmm9
+ pslld $2,%xmm3
+ xorl %ecx,%esi
movl %eax,%edi
+ psrld $30,%xmm9
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
por %xmm9,%xmm3
+ addl %eax,%ebp
addl 60(%rsp),%edx
- xorl %ecx,%edi
- movdqa %xmm3,%xmm10
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 0(%rsp),%ecx
- pxor %xmm0,%xmm4
-.byte 102,68,15,58,15,210,8
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
movl %edx,%edi
roll $5,%edx
pxor %xmm5,%xmm4
- xorl %eax,%esi
- addl %edx,%ecx
+ addl %esi,%ecx
+ xorl %eax,%edi
movdqa %xmm8,%xmm9
- paddd %xmm3,%xmm8
rorl $7,%ebp
- addl %esi,%ecx
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
pxor %xmm10,%xmm4
addl 4(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
movdqa %xmm8,48(%rsp)
- xorl %ebp,%edi
- addl %ecx,%ebx
rorl $7,%edx
- addl %edi,%ebx
- pslld $2,%xmm4
+ addl %ecx,%ebx
addl 8(%rsp),%eax
- xorl %ebp,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm4
+ xorl %edx,%esi
movl %ebx,%edi
+ psrld $30,%xmm10
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
por %xmm10,%xmm4
+ addl %ebx,%eax
addl 12(%rsp),%ebp
- xorl %edx,%edi
- movdqa %xmm4,%xmm8
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 16(%rsp),%edx
- pxor %xmm1,%xmm5
-.byte 102,68,15,58,15,195,8
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
movl %ebp,%edi
roll $5,%ebp
pxor %xmm6,%xmm5
- xorl %ebx,%esi
- addl %ebp,%edx
+ addl %esi,%edx
+ xorl %ebx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm4,%xmm9
rorl $7,%eax
- addl %esi,%edx
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
pxor %xmm8,%xmm5
addl 20(%rsp),%ecx
- xorl %ebx,%edi
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
movdqa %xmm9,0(%rsp)
- xorl %eax,%edi
- addl %edx,%ecx
rorl $7,%ebp
- addl %edi,%ecx
- pslld $2,%xmm5
+ addl %edx,%ecx
addl 24(%rsp),%ebx
- xorl %eax,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm5
+ xorl %ebp,%esi
movl %ecx,%edi
+ psrld $30,%xmm8
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
por %xmm8,%xmm5
+ addl %ecx,%ebx
addl 28(%rsp),%eax
- xorl %ebp,%edi
- movdqa %xmm5,%xmm9
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
movl %ebx,%esi
- roll $5,%ebx
xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ roll $5,%ebx
addl %edi,%eax
- movl %ecx,%edi
- pxor %xmm2,%xmm6
-.byte 102,68,15,58,15,204,8
+ xorl %ecx,%esi
xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
addl 32(%rsp),%ebp
- andl %edx,%edi
- pxor %xmm7,%xmm6
andl %ecx,%esi
+ xorl %edx,%ecx
rorl $7,%ebx
- movdqa %xmm10,%xmm8
- paddd %xmm5,%xmm10
- addl %edi,%ebp
+ punpcklqdq %xmm5,%xmm9
movl %eax,%edi
- pxor %xmm9,%xmm6
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
roll $5,%eax
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movdqa %xmm6,%xmm9
- movdqa %xmm10,16(%rsp)
- movl %ebx,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
addl 36(%rsp),%edx
- andl %ecx,%esi
- pslld $2,%xmm6
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- psrld $30,%xmm9
- addl %esi,%edx
+ movdqa %xmm6,%xmm9
movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- por %xmm9,%xmm6
- movl %eax,%edi
+ xorl %eax,%esi
+ pslld $2,%xmm6
xorl %ebx,%eax
- movdqa %xmm6,%xmm10
+ addl %ebp,%edx
+ psrld $30,%xmm9
addl 40(%rsp),%ecx
- andl %ebx,%edi
andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
rorl $7,%ebp
- addl %edi,%ecx
movl %edx,%edi
+ xorl %eax,%esi
roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movl %ebp,%esi
+ xorl %ebp,%edi
xorl %eax,%ebp
+ addl %edx,%ecx
addl 44(%rsp),%ebx
- andl %eax,%esi
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- addl %esi,%ebx
movl %ecx,%esi
+ xorl %ebp,%edi
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
+ xorl %edx,%esi
+ xorl %ebp,%edx
addl %ecx,%ebx
- movl %edx,%edi
pxor %xmm3,%xmm7
-.byte 102,68,15,58,15,213,8
- xorl %ebp,%edx
addl 48(%rsp),%eax
- andl %ebp,%edi
- pxor %xmm0,%xmm7
andl %edx,%esi
+ xorl %ebp,%edx
rorl $7,%ecx
- movdqa 48(%r11),%xmm9
- paddd %xmm6,%xmm8
- addl %edi,%eax
+ punpcklqdq %xmm6,%xmm10
movl %ebx,%edi
- pxor %xmm10,%xmm7
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movdqa %xmm7,%xmm10
- movdqa %xmm8,32(%rsp)
- movl %ecx,%esi
+ movdqa 32(%r11),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
addl 52(%rsp),%ebp
- andl %edx,%esi
- pslld $2,%xmm7
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- psrld $30,%xmm10
- addl %esi,%ebp
+ movdqa %xmm7,%xmm10
movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- por %xmm10,%xmm7
- movl %ebx,%edi
+ xorl %ebx,%esi
+ pslld $2,%xmm7
xorl %ecx,%ebx
- movdqa %xmm7,%xmm8
+ addl %eax,%ebp
+ psrld $30,%xmm10
addl 56(%rsp),%edx
- andl %ecx,%edi
andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
rorl $7,%eax
- addl %edi,%edx
movl %ebp,%edi
+ xorl %ebx,%esi
roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movl %eax,%esi
+ xorl %eax,%edi
xorl %ebx,%eax
+ addl %ebp,%edx
addl 60(%rsp),%ecx
- andl %ebx,%esi
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- addl %esi,%ecx
movl %edx,%esi
+ xorl %eax,%edi
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
+ xorl %ebp,%esi
+ xorl %eax,%ebp
addl %edx,%ecx
- movl %ebp,%edi
pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,198,8
- xorl %eax,%ebp
addl 0(%rsp),%ebx
- andl %eax,%edi
- pxor %xmm1,%xmm0
andl %ebp,%esi
+ xorl %eax,%ebp
rorl $7,%edx
- movdqa %xmm9,%xmm10
- paddd %xmm7,%xmm9
- addl %edi,%ebx
+ punpcklqdq %xmm7,%xmm8
movl %ecx,%edi
- pxor %xmm8,%xmm0
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
roll $5,%ecx
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movdqa %xmm0,%xmm8
- movdqa %xmm9,48(%rsp)
- movl %edx,%esi
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
addl 4(%rsp),%eax
- andl %ebp,%esi
- pslld $2,%xmm0
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- psrld $30,%xmm8
- addl %esi,%eax
+ movdqa %xmm0,%xmm8
movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- por %xmm8,%xmm0
- movl %ecx,%edi
+ xorl %ecx,%esi
+ pslld $2,%xmm0
xorl %edx,%ecx
- movdqa %xmm0,%xmm9
+ addl %ebx,%eax
+ psrld $30,%xmm8
addl 8(%rsp),%ebp
- andl %edx,%edi
andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
rorl $7,%ebx
- addl %edi,%ebp
movl %eax,%edi
+ xorl %ecx,%esi
roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movl %ebx,%esi
+ xorl %ebx,%edi
xorl %ecx,%ebx
+ addl %eax,%ebp
addl 12(%rsp),%edx
- andl %ecx,%esi
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- addl %esi,%edx
movl %ebp,%esi
+ xorl %ebx,%edi
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
+ xorl %eax,%esi
+ xorl %ebx,%eax
addl %ebp,%edx
- movl %eax,%edi
pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,207,8
- xorl %ebx,%eax
addl 16(%rsp),%ecx
- andl %ebx,%edi
- pxor %xmm2,%xmm1
andl %eax,%esi
+ xorl %ebx,%eax
rorl $7,%ebp
- movdqa %xmm10,%xmm8
- paddd %xmm0,%xmm10
- addl %edi,%ecx
+ punpcklqdq %xmm0,%xmm9
movl %edx,%edi
- pxor %xmm9,%xmm1
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
roll $5,%edx
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movdqa %xmm1,%xmm9
- movdqa %xmm10,0(%rsp)
- movl %ebp,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
addl 20(%rsp),%ebx
- andl %eax,%esi
- pslld $2,%xmm1
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- psrld $30,%xmm9
- addl %esi,%ebx
+ movdqa %xmm1,%xmm9
movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- por %xmm9,%xmm1
- movl %edx,%edi
+ xorl %edx,%esi
+ pslld $2,%xmm1
xorl %ebp,%edx
- movdqa %xmm1,%xmm10
+ addl %ecx,%ebx
+ psrld $30,%xmm9
addl 24(%rsp),%eax
- andl %ebp,%edi
andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
rorl $7,%ecx
- addl %edi,%eax
movl %ebx,%edi
+ xorl %edx,%esi
roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movl %ecx,%esi
+ xorl %ecx,%edi
xorl %edx,%ecx
+ addl %ebx,%eax
addl 28(%rsp),%ebp
- andl %edx,%esi
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- addl %esi,%ebp
movl %eax,%esi
+ xorl %ecx,%edi
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
addl %eax,%ebp
- movl %ebx,%edi
pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,208,8
- xorl %ecx,%ebx
addl 32(%rsp),%edx
- andl %ecx,%edi
- pxor %xmm3,%xmm2
andl %ebx,%esi
+ xorl %ecx,%ebx
rorl $7,%eax
- movdqa %xmm8,%xmm9
- paddd %xmm1,%xmm8
- addl %edi,%edx
+ punpcklqdq %xmm1,%xmm10
movl %ebp,%edi
- pxor %xmm10,%xmm2
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
roll $5,%ebp
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movdqa %xmm2,%xmm10
- movdqa %xmm8,16(%rsp)
- movl %eax,%esi
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
addl 36(%rsp),%ecx
- andl %ebx,%esi
- pslld $2,%xmm2
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- psrld $30,%xmm10
- addl %esi,%ecx
+ movdqa %xmm2,%xmm10
movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- por %xmm10,%xmm2
- movl %ebp,%edi
+ xorl %ebp,%esi
+ pslld $2,%xmm2
xorl %eax,%ebp
- movdqa %xmm2,%xmm8
+ addl %edx,%ecx
+ psrld $30,%xmm10
addl 40(%rsp),%ebx
- andl %eax,%edi
andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
rorl $7,%edx
- addl %edi,%ebx
movl %ecx,%edi
+ xorl %ebp,%esi
roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movl %edx,%esi
+ xorl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
addl 44(%rsp),%eax
- andl %ebp,%esi
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- addl %esi,%eax
movl %ebx,%esi
+ xorl %edx,%edi
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
+ xorl %edx,%esi
addl %ebx,%eax
- addl 48(%rsp),%ebp
pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,193,8
- xorl %edx,%esi
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
movl %eax,%edi
roll $5,%eax
pxor %xmm4,%xmm3
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm2,%xmm9
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
pxor %xmm8,%xmm3
addl 52(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm9,32(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm3
+ addl %ebp,%edx
addl 56(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm3
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm8
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm8,%xmm3
+ addl %edx,%ecx
addl 60(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 0(%rsp),%eax
- paddd %xmm3,%xmm10
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
movdqa %xmm10,48(%rsp)
- addl %ebx,%eax
rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
addl 4(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 8(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 12(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
cmpq %r10,%r9
je .Ldone_ssse3
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2302,113 +2230,112 @@ _ssse3_shortcut:
.byte 102,15,56,0,198
addq $64,%r9
addl 16(%rsp),%ebx
- xorl %eax,%esi
-.byte 102,15,56,0,206
+ xorl %ebp,%esi
movl %ecx,%edi
+.byte 102,15,56,0,206
roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
paddd %xmm9,%xmm0
- xorl %ebp,%esi
addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- movdqa %xmm0,0(%rsp)
addl 20(%rsp),%eax
- xorl %ebp,%edi
- psubd %xmm9,%xmm0
+ xorl %edx,%edi
movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
-.byte 102,15,56,0,214
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
+.byte 102,15,56,0,214
roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
paddd %xmm9,%xmm1
- xorl %eax,%esi
addl %edx,%ecx
- rorl $7,%ebp
- addl %esi,%ecx
- movdqa %xmm1,16(%rsp)
addl 36(%rsp),%ebx
- xorl %eax,%edi
- psubd %xmm9,%xmm1
+ xorl %ebp,%edi
movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
-.byte 102,15,56,0,222
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
+.byte 102,15,56,0,222
roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
paddd %xmm9,%xmm2
- xorl %ebx,%esi
addl %ebp,%edx
- rorl $7,%eax
- addl %esi,%edx
- movdqa %xmm2,32(%rsp)
addl 52(%rsp),%ecx
- xorl %ebx,%edi
- psubd %xmm9,%xmm2
+ xorl %eax,%edi
movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2418,108 +2345,110 @@ _ssse3_shortcut:
movl %esi,4(%r8)
movl %esi,%ebx
movl %ecx,8(%r8)
+ movl %ecx,%edi
movl %edx,12(%r8)
+ xorl %edx,%edi
movl %ebp,16(%r8)
+ andl %edi,%esi
jmp .Loop_ssse3
.p2align 4
.Ldone_ssse3:
addl 16(%rsp),%ebx
- xorl %eax,%esi
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 20(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
- addl 36(%rsp),%ebx
xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 52(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2530,16 +2459,19 @@ _ssse3_shortcut:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- movaps 64+0(%rsp),%xmm6
- movaps 64+16(%rsp),%xmm7
- movaps 64+32(%rsp),%xmm8
- movaps 64+48(%rsp),%xmm9
- movaps 64+64(%rsp),%xmm10
- leaq 144(%rsp),%rsi
- movq 0(%rsi),%r12
- movq 8(%rsi),%rbp
- movq 16(%rsi),%rbx
- leaq 24(%rsi),%rsp
+ movaps -40-96(%r14),%xmm6
+ movaps -40-80(%r14),%xmm7
+ movaps -40-64(%r14),%xmm8
+ movaps -40-48(%r14),%xmm9
+ movaps -40-32(%r14),%xmm10
+ movaps -40-16(%r14),%xmm11
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_ssse3:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -2547,11 +2479,16 @@ _ssse3_shortcut:
.LSEH_end_sha1_block_data_order_ssse3:
.p2align 6
K_XX_XX:
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
@@ -2583,16 +2520,17 @@ se_handler:
jae .Lcommon_seh_tail
movq 64(%rax),%rax
- leaq 32(%rax),%rax
movq -8(%rax),%rbx
movq -16(%rax),%rbp
movq -24(%rax),%r12
movq -32(%rax),%r13
+ movq -40(%rax),%r14
movq %rbx,144(%r8)
movq %rbp,160(%r8)
movq %r12,216(%r8)
movq %r13,224(%r8)
+ movq %r14,232(%r8)
jmp .Lcommon_seh_tail
@@ -2629,18 +2567,23 @@ ssse3_handler:
cmpq %r10,%rbx
jae .Lcommon_seh_tail
- leaq 64(%rax),%rsi
+ movq 232(%r8),%rax
+
+ leaq -40-96(%rax),%rsi
leaq 512(%r8),%rdi
- movl $10,%ecx
-.long 0xa548f3fc
- leaq 168(%rax),%rax
+ movl $12,%ecx
+.long 0xa548f3fc
movq -8(%rax),%rbx
movq -16(%rax),%rbp
movq -24(%rax),%r12
+ movq -32(%rax),%r13
+ movq -40(%rax),%r14
movq %rbx,144(%r8)
movq %rbp,160(%r8)
movq %r12,216(%r8)
+ movq %r13,224(%r8)
+ movq %r14,232(%r8)
.Lcommon_seh_tail:
movq 8(%rax),%rdi
@@ -2652,7 +2595,7 @@ ssse3_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -2698,5 +2641,5 @@ ssse3_handler:
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
-.rva .Lprologue_ssse3,.Lepilogue_ssse3
+.rva .Lprologue_ssse3,.Lepilogue_ssse3
diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s b/lib/accelerated/x86/coff/sha256-ssse3-x86.s
index 61d6eaccf9..91cf88cc34 100644
--- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s
@@ -64,195 +64,391 @@ _sha256_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ jmp .L002loop
.align 16
.L002loop:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
- movl 12(%edi),%edx
bswap %eax
+ movl 12(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 16(%edi),%eax
movl 20(%edi),%ebx
movl 24(%edi),%ecx
- movl 28(%edi),%edx
bswap %eax
+ movl 28(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 32(%edi),%eax
movl 36(%edi),%ebx
movl 40(%edi),%ecx
- movl 44(%edi),%edx
bswap %eax
+ movl 44(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 48(%edi),%eax
movl 52(%edi),%ebx
movl 56(%edi),%ecx
- movl 60(%edi),%edx
bswap %eax
+ movl 60(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
addl $64,%edi
- subl $32,%esp
- movl %edi,100(%esp)
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edi
- movl %ebx,4(%esp)
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
movl 16(%esi),%edx
movl 20(%esi),%ebx
movl 24(%esi),%ecx
movl 28(%esi),%edi
- movl %ebx,20(%esp)
- movl %ecx,24(%esp)
- movl %edi,28(%esp)
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
.align 16
.L00300_15:
- movl 92(%esp),%ebx
movl %edx,%ecx
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $5,%ecx
- xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
- movl %eax,(%esp)
- movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
- movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %edi,%eax
addl $4,%ebp
addl %ebx,%eax
- addl %esi,%edx
- addl %esi,%eax
cmpl $3248222580,%esi
jne .L00300_15
- movl 152(%esp),%ebx
+ movl 156(%esp),%ecx
+ jmp .L00416_63
.align 16
.L00416_63:
- movl %ebx,%esi
- movl 100(%esp),%ecx
- rorl $11,%esi
- movl %ecx,%edi
- xorl %ebx,%esi
- rorl $7,%esi
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
shrl $3,%ebx
- rorl $2,%edi
- xorl %esi,%ebx
- xorl %ecx,%edi
- rorl $17,%edi
- shrl $10,%ecx
- addl 156(%esp),%ebx
- xorl %ecx,%edi
- addl 120(%esp),%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
movl %edx,%ecx
- addl %edi,%ebx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
- xorl %edx,%ecx
- rorl $5,%ecx
- movl %ebx,92(%esp)
+ addl %edi,%ebx
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne .L00416_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005loop_shrd:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 16
+.L00600_15_shrd:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne .L00600_15_shrd
+ movl 156(%esp),%ecx
+ jmp .L00716_63_shrd
+.align 16
+.L00716_63_shrd:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ shrdl $11,%ecx,%ecx
+ movl %esi,%edi
+ shrdl $2,%esi,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ shrdl $17,%esi,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
addl $4,%ebp
addl %ebx,%eax
- movl 152(%esp),%ebx
- addl %esi,%edx
- addl %esi,%eax
cmpl $3329325298,%esi
- jne .L00416_63
- movl 352(%esp),%esi
- movl 4(%esp),%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edi
+ jne .L00716_63_shrd
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
addl (%esi),%eax
addl 4(%esi),%ebx
- addl 8(%esi),%ecx
- addl 12(%esi),%edi
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
movl %eax,(%esi)
movl %ebx,4(%esi)
- movl %ecx,8(%esi)
- movl %edi,12(%esi)
- movl 20(%esp),%eax
- movl 24(%esp),%ebx
- movl 28(%esp),%ecx
- movl 356(%esp),%edi
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
addl 16(%esi),%edx
addl 20(%esi),%eax
addl 24(%esi),%ebx
@@ -261,10 +457,10 @@ _sha256_block_data_order:
movl %eax,20(%esi)
movl %ebx,24(%esi)
movl %ecx,28(%esi)
- addl $352,%esp
+ leal 356(%esp),%esp
subl $256,%ebp
cmpl 8(%esp),%edi
- jb .L002loop
+ jb .L005loop_shrd
movl 12(%esp),%esp
popl %edi
popl %esi
@@ -273,25 +469,2918 @@ _sha256_block_data_order:
ret
.align 64
.L001K256:
-.long 1116352408,1899447441,3049323471,3921009573
-.long 961987163,1508970993,2453635748,2870763221
-.long 3624381080,310598401,607225278,1426881987
-.long 1925078388,2162078206,2614888103,3248222580
-.long 3835390401,4022224774,264347078,604807628
-.long 770255983,1249150122,1555081692,1996064986
-.long 2554220882,2821834349,2952996808,3210313671
-.long 3336571891,3584528711,113926993,338241895
-.long 666307205,773529912,1294757372,1396182291
-.long 1695183700,1986661051,2177026350,2456956037
-.long 2730485921,2820302411,3259730800,3345764771
-.long 3516065817,3600352804,4094571909,275423344
-.long 430227734,506948616,659060556,883997877
-.long 958139571,1322822218,1537002063,1747873779
-.long 1955562222,2024104815,2227730452,2361852424
-.long 2428436474,2756734187,3204031479,3329325298
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.align 16
+.L008unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp .L009grand_loop
+.align 16
+.L009grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb .L009grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
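
The unrolled 32-bit code above is the plain SHA-256 compression loop: each repeated block of instructions interleaves one round with part of the message schedule, the leal immediates (2177026350 = 0x81c2c92e and so on) are the round constants K[t], and the rorl/shrl/xorl chains compute the Sigma and sigma functions in composed form (for instance ror 14, then ror 5, then ror 6 with interleaved xors of the same word realizes ror6 ^ ror11 ^ ror25). As a reading aid only, here is a minimal C sketch of one round with the same functions written out directly; the function name and array layout are illustrative assumptions, not gnutls API:

    #include <stdint.h>

    /* 32-bit rotate right, what each rorl instruction does. */
    static inline uint32_t ror32(uint32_t x, int n)
    {
        return (x >> n) | (x << (32 - n));
    }

    /* One SHA-256 round over the working variables a..h (s[0]..s[7]);
     * k is the round constant K[t], w the expanded message word W[t]. */
    static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); /* Sigma1(e) */
        uint32_t ch  = (e & f) ^ (~e & g);                        /* Ch(e,f,g)  */
        uint32_t t1  = h + S1 + ch + k + w;
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); /* Sigma0(a) */
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);               /* Maj(a,b,c) */
        uint32_t t2  = S0 + maj;

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

The schedule fragments visible between rounds (ror 11 / shr 3 / ror 7, and ror 2 / ror 17 / shr 10) correspond in the same composed way to sigma0 = ror7 ^ ror18 ^ shr3 and sigma1 = ror17 ^ ror19 ^ shr10 applied to earlier W words.
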
diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s
index acad0ec1e7..42233253c9 100644
--- a/lib/accelerated/x86/coff/sha512-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s
@@ -594,6 +594,8 @@ _sha512_block_data_order:
.long 4234509866,1501505948
.long 987167468,1607167915
.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
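
The two .long lines added to sha512-ssse3-x86.s decode to the dwords 0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b, consecutive byte indices in the shape of a pshufb-style byte-reordering mask appended after the constant table; that reading is an inference from the values alone. A small stand-alone check of the decoding:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* The decimal .long values from the hunk above, in file order. */
        const uint32_t added[4] = { 67438087u, 66051u, 202182159u, 134810123u };

        /* Prints: 04050607 00010203 0c0d0e0f 08090a0b */
        for (int i = 0; i < 4; i++)
            printf("%08x ", (unsigned)added[i]);
        printf("\n");
        return 0;
    }
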
diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
index 2b6cd0fcd3..edaa67b95f 100644
--- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl sha256_block_data_order
.def sha256_block_data_order; .scl 2; .type 32; .endef
.p2align 4
@@ -50,8 +51,13 @@ sha256_block_data_order:
movq %rcx,%rdi
movq %rdx,%rsi
movq %r8,%rdx
- movq %r9,%rcx
+ leaq _gnutls_x86_cpuid_s(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $512,%r10d
+ jnz .Lssse3_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -69,8 +75,6 @@ sha256_block_data_order:
movq %r11,64+24(%rsp)
.Lprologue:
- leaq K256(%rip),%rbp
-
movl 0(%rdi),%eax
movl 4(%rdi),%ebx
movl 8(%rdi),%ecx
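
In the x86_64 file the changed prologue above loads three dwords from _gnutls_x86_cpuid_s and does testl $512,%r10d / jnz .Lssse3_shortcut: 512 is bit 9, which matches the SSSE3 feature bit of CPUID leaf 1 ECX, so the integer rounds are only reached when SSSE3 is absent and otherwise control jumps to the sha256_block_data_order_ssse3 body added further down in this diff. The big hunk that follows also switches the round-constant access from an indexed (%rbp,%rdi,4) load with an %rdi counter to a moving %rbp pointer bumped by leaq 4(%rbp),%rbp. In C terms the dispatch amounts to something like the sketch below; the helper names and the three-word layout are assumptions for illustration only:

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for the cached CPUID words the assembly reads from
     * _gnutls_x86_cpuid_s; the three-dword layout is an assumption here. */
    static uint32_t x86_cpuid_words[3];

    static void sha256_blocks_integer(uint32_t state[8], const uint8_t *in, size_t blocks)
    {
        (void)state; (void)in; (void)blocks;  /* scalar rounds, as in the hunk below */
    }

    static void sha256_blocks_ssse3(uint32_t state[8], const uint8_t *in, size_t blocks)
    {
        (void)state; (void)in; (void)blocks;  /* vectorized schedule, as added at the end of the file */
    }

    /* Equivalent of the new prologue's testl $512,%r10d / jnz .Lssse3_shortcut:
     * bit 9 (0x200) is the SSSE3 feature bit, so the SIMD path runs whenever
     * the cached CPUID word reports SSSE3. */
    void sha256_block_dispatch(uint32_t state[8], const uint8_t *in, size_t blocks)
    {
        if (x86_cpuid_words[1] & 0x200u)
            sha256_blocks_ssse3(state, in, blocks);
        else
            sha256_blocks_integer(state, in, blocks);
    }
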
@@ -83,1694 +87,1632 @@ sha256_block_data_order:
.p2align 4
.Lloop:
- xorq %rdi,%rdi
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
movl 0(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 4(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 8(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 12(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 16(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 20(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 24(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 28(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
movl 32(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 36(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 40(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 44(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 48(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 52(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 56(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 60(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
jmp .Lrounds_16_xx
.p2align 4
.Lrounds_16_xx:
movl 4(%rsp),%r13d
- movl 56(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 56(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 36(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
addl 0(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 8(%rsp),%r13d
- movl 60(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 60(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 40(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
addl 4(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 12(%rsp),%r13d
- movl 0(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 0(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 44(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
addl 8(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 16(%rsp),%r13d
- movl 4(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 4(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 48(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
addl 12(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 20(%rsp),%r13d
- movl 8(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 8(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 52(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
addl 16(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 24(%rsp),%r13d
- movl 12(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 12(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 56(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
addl 20(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 28(%rsp),%r13d
- movl 16(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 16(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 60(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
addl 24(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 32(%rsp),%r13d
- movl 20(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 20(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 0(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
addl 28(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
movl 36(%rsp),%r13d
- movl 24(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 24(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 4(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
addl 32(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 40(%rsp),%r13d
- movl 28(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 28(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 8(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
addl 36(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 44(%rsp),%r13d
- movl 32(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 32(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 12(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
addl 40(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 48(%rsp),%r13d
- movl 36(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 36(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 16(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
addl 44(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 52(%rsp),%r13d
- movl 40(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 40(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 20(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
addl 48(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 56(%rsp),%r13d
- movl 44(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 44(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 24(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
addl 52(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 60(%rsp),%r13d
- movl 48(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 48(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 28(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
addl 56(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 0(%rsp),%r13d
- movl 52(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 52(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 32(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
addl 60(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
- cmpq $64,%rdi
- jb .Lrounds_16_xx
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
leaq 64(%rsi),%rsi
addl 0(%rdi),%eax
@@ -1811,21 +1753,1158 @@ sha256_block_data_order:
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef
+.p2align 6
+sha256_block_data_order_ssse3:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_sha256_block_data_order_ssse3:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+.Lssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+ movaps %xmm6,64+32(%rsp)
+ movaps %xmm7,64+48(%rsp)
+ movaps %xmm8,64+64(%rsp)
+ movaps %xmm9,64+80(%rsp)
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp .Lloop_ssse3
+.p2align 4
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+.byte 102,15,56,0,199
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 32(%rbp),%xmm5
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.p2align 4
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 64+24(%rsp),%rsi
+ movaps 64+32(%rsp),%xmm6
+ movaps 64+48(%rsp),%xmm7
+ movaps 64+64(%rsp),%xmm8
+ movaps 64+80(%rsp),%xmm9
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_sha256_block_data_order_ssse3:
.def se_handler; .scl 3; .type 32; .endef
.p2align 4
@@ -1844,16 +2923,21 @@ se_handler:
movq 120(%r8),%rax
movq 248(%r8),%rbx
- leaq .Lprologue(%rip),%r10
+ movq 8(%r9),%rsi
+ movq 56(%r9),%r11
+
+ movl 0(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
cmpq %r10,%rbx
jb .Lin_prologue
movq 152(%r8),%rax
- leaq .Lepilogue(%rip),%r10
+ movl 4(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
cmpq %r10,%rbx
jae .Lin_prologue
-
+ movq %rax,%rsi
movq 64+24(%rax),%rax
leaq 48(%rax),%rax
@@ -1870,6 +2954,15 @@ se_handler:
movq %r14,232(%r8)
movq %r15,240(%r8)
+ leaq .Lepilogue(%rip),%r10
+ cmpq %r10,%rbx
+ jb .Lin_prologue
+
+ leaq 64+32(%rsi),%rsi
+ leaq 512(%r8),%rdi
+ movl $8,%ecx
+.long 0xa548f3fc
+
.Lin_prologue:
movq 8(%rax),%rdi
movq 16(%rax),%rsi
@@ -1880,7 +2973,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1915,10 +3008,17 @@ se_handler:
.rva .LSEH_begin_sha256_block_data_order
.rva .LSEH_end_sha256_block_data_order
.rva .LSEH_info_sha256_block_data_order
-
+.rva .LSEH_begin_sha256_block_data_order_ssse3
+.rva .LSEH_end_sha256_block_data_order_ssse3
+.rva .LSEH_info_sha256_block_data_order_ssse3
.section .xdata
.p2align 3
.LSEH_info_sha256_block_data_order:
.byte 9,0,0,0
.rva se_handler
+.rva .Lprologue,.Lepilogue
+.LSEH_info_sha256_block_data_order_ssse3:
+.byte 9,0,0,0
+.rva se_handler
+.rva .Lprologue_ssse3,.Lepilogue_ssse3
diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86.s b/lib/accelerated/x86/elf/aes-ssse3-x86.s
index 92bdeeadb3..3aa221267a 100644
--- a/lib/accelerated/x86/elf/aes-ssse3-x86.s
+++ b/lib/accelerated/x86/elf/aes-ssse3-x86.s
@@ -85,33 +85,33 @@ _vpaes_encrypt_core:
movdqa %xmm6,%xmm1
movdqa (%ebp),%xmm2
pandn %xmm0,%xmm1
- movdqu (%edx),%xmm5
- psrld $4,%xmm1
pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
.byte 102,15,56,0,208
movdqa 16(%ebp),%xmm0
-.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
+ psrld $4,%xmm1
addl $16,%edx
+.byte 102,15,56,0,193
leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
jmp .L000enc_entry
.align 16
.L001enc_loop:
movdqa 32(%ebp),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa 64(%ebp),%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
movdqa 80(%ebp),%xmm2
-.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addl $16,%edx
pxor %xmm2,%xmm0
@@ -120,28 +120,28 @@ _vpaes_encrypt_core:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andl $48,%ecx
- pxor %xmm3,%xmm0
subl $1,%eax
+ pxor %xmm3,%xmm0
.L000enc_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
- movdqu (%edx),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
pxor %xmm1,%xmm3
jnz .L001enc_loop
movdqa 96(%ebp),%xmm4
@@ -157,8 +157,8 @@ _vpaes_encrypt_core:
.type _vpaes_decrypt_core,@function
.align 16
_vpaes_decrypt_core:
- movl 240(%edx),%eax
leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
movdqa %xmm6,%xmm1
movdqa -64(%ebx),%xmm2
pandn %xmm0,%xmm1
@@ -181,56 +181,56 @@ _vpaes_decrypt_core:
.align 16
.L003dec_loop:
movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addl $16,%edx
-.byte 102,15,56,0,197
movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subl $1,%eax
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ addl $16,%edx
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
.L002dec_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
pandn %xmm0,%xmm1
- psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm2
+ psrld $4,%xmm1
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm7,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
jnz .L003dec_loop
movdqa 96(%ebx),%xmm4
.byte 102,15,56,0,226
@@ -339,12 +339,12 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
@@ -613,6 +613,8 @@ vpaes_cbc_encrypt:
movl 24(%esp),%edi
movl 28(%esp),%eax
movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
leal -56(%esp),%ebx
movl 36(%esp),%ebp
andl $-16,%ebx
@@ -622,18 +624,17 @@ vpaes_cbc_encrypt:
subl %esi,%edi
movl %ebx,48(%esp)
movl %edi,(%esp)
- subl $16,%eax
movl %edx,4(%esp)
movl %ebp,8(%esp)
movl %eax,%edi
- leal .L_vpaes_consts+0x30-.L020pic_point,%ebp
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
call _vpaes_preheat
-.L020pic_point:
+.L021pic_point:
cmpl $0,%ecx
- je .L021cbc_dec_loop
- jmp .L022cbc_enc_loop
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
.align 16
-.L022cbc_enc_loop:
+.L023cbc_enc_loop:
movdqu (%esi),%xmm0
pxor %xmm1,%xmm0
call _vpaes_encrypt_core
@@ -643,10 +644,10 @@ vpaes_cbc_encrypt:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc .L022cbc_enc_loop
- jmp .L023cbc_done
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
.align 16
-.L021cbc_dec_loop:
+.L022cbc_dec_loop:
movdqu (%esi),%xmm0
movdqa %xmm1,16(%esp)
movdqa %xmm0,32(%esp)
@@ -658,11 +659,12 @@ vpaes_cbc_encrypt:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc .L021cbc_dec_loop
-.L023cbc_done:
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
movl 8(%esp),%ebx
movl 48(%esp),%esp
movdqu %xmm1,(%ebx)
+.L020cbc_abort:
popl %edi
popl %esi
popl %ebx
diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
index 78dd07e700..09cc51059f 100644
--- a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
@@ -43,8 +43,8 @@ _vpaes_encrypt_core:
movdqa .Lk_ipt+16(%rip),%xmm0
.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
addq $16,%r9
+ pxor %xmm2,%xmm0
leaq .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
@@ -52,19 +52,19 @@ _vpaes_encrypt_core:
.Lenc_loop:
movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa %xmm15,%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
movdqa %xmm14,%xmm2
.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
- movdqa (%r11,%r10,1),%xmm4
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addq $16,%r9
pxor %xmm2,%xmm0
@@ -73,30 +73,30 @@ _vpaes_encrypt_core:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andq $48,%r11
- pxor %xmm3,%xmm0
subq $1,%rax
+ pxor %xmm3,%xmm0
.Lenc_entry:
movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
- movdqu (%r9),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
jnz .Lenc_loop
@@ -149,62 +149,61 @@ _vpaes_decrypt_core:
movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addq $16,%r9
-
-.byte 102,15,56,0,197
movdqa 0(%r10),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%r10),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subq $1,%rax
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
-.byte 102,15,56,0,197
- movdqa 32(%r10),%xmm4
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+.byte 102,15,56,0,226
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
-
+ addq $16,%r9
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
.Ldec_entry:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm2
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
jnz .Ldec_loop
@@ -212,7 +211,7 @@ _vpaes_decrypt_core:
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
- movdqa .Lk_sr-.Lk_dsbd(%r11),%xmm2
+ movdqa -352(%r11),%xmm2
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,194
@@ -232,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -278,7 +277,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
@@ -299,7 +298,7 @@ _vpaes_schedule_core:
.align 16
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -308,13 +307,13 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
@@ -331,18 +330,18 @@ _vpaes_schedule_core:
.align 16
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
pshufd $255,%xmm0,%xmm0
@@ -380,7 +379,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -412,12 +411,12 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
.byte 0xf3,0xc3
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
@@ -680,9 +679,10 @@ vpaes_decrypt:
.align 16
vpaes_cbc_encrypt:
xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
movdqu (%r8),%xmm6
subq %rdi,%rsi
- subq $16,%rcx
call _vpaes_preheat
cmpl $0,%r9d
je .Lcbc_dec_loop
@@ -711,6 +711,7 @@ vpaes_cbc_encrypt:
jnc .Lcbc_dec_loop
.Lcbc_done:
movdqu %xmm6,(%r8)
+.Lcbc_abort:
.byte 0xf3,0xc3
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
@@ -833,7 +834,7 @@ _vpaes_consts:
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 64
.size _vpaes_consts,.-_vpaes_consts
diff --git a/lib/accelerated/x86/elf/aesni-x86.s b/lib/accelerated/x86/elf/aesni-x86.s
index 5ee55499d6..ab8ee06865 100644
--- a/lib/accelerated/x86/elf/aesni-x86.s
+++ b/lib/accelerated/x86/elf/aesni-x86.s
@@ -87,29 +87,84 @@ aesni_decrypt:
movups %xmm2,(%eax)
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -122,25 +177,26 @@ _aesni_encrypt3:
.align 16
_aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -154,27 +210,29 @@ _aesni_decrypt3:
_aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -190,27 +248,29 @@ _aesni_encrypt4:
_aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -225,45 +285,44 @@ _aesni_decrypt4:
.align 16
_aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp .L_aesni_encrypt6_enter
.align 16
-.L006enc6_loop:
+.L008enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L008enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -282,45 +341,44 @@ _aesni_encrypt6:
.align 16
_aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp .L_aesni_decrypt6_enter
.align 16
-.L007dec6_loop:
+.L009dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -350,14 +408,14 @@ aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L010ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L011ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L012ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -366,9 +424,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L013ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L014ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -383,12 +441,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L013ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L014ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -397,18 +455,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L010ecb_ret
+.L012ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L015ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L016ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L017ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L018ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
@@ -417,50 +475,49 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L013ecb_enc_one:
+.L015ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L019enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L019enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+.L016ecb_enc_two:
+ call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L015ecb_enc_three:
+.L017ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L016ecb_enc_four:
+.L018ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L009ecb_decrypt:
+.L011ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L020ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -469,9 +526,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L021ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L022ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -486,12 +543,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L021ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L022ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -500,18 +557,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L010ecb_ret
+.L020ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L023ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L024ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L025ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L026ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
@@ -520,44 +577,43 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L021ecb_dec_one:
+.L023ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L027dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L027dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L024ecb_dec_two:
+ call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L023ecb_dec_three:
+.L025ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L010ecb_ret
.align 16
-.L024ecb_dec_four:
+.L026ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L010ecb_ret:
popl %edi
popl %esi
popl %ebx
@@ -596,45 +652,45 @@ aesni_ccm64_encrypt_blocks:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L028ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L029ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L029ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L028ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
@@ -685,67 +741,70 @@ aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L030enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L030enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L031ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L031ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L032ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L033ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L033ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L031ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L032ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L034enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L034enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
@@ -775,7 +834,7 @@ aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L035ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -791,63 +850,59 @@ aesni_ctr32_encrypt_blocks:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L036ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L037ctr32_loop6
+.align 16
+.L037ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -858,51 +913,51 @@ aesni_ctr32_encrypt_blocks:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L037ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L038ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L036ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L039ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L040ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L041ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L042ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
@@ -920,39 +975,39 @@ aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L035ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L039ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L043enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L043enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L038ctr32_two:
- call _aesni_encrypt3
+.L040ctr32_two:
+ call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L039ctr32_three:
+.L041ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -963,9 +1018,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L038ctr32_ret
.align 16
-.L040ctr32_four:
+.L042ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -979,7 +1034,7 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L038ctr32_ret:
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1004,12 +1059,12 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L044enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L044enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1033,12 +1088,14 @@ aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L045xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L046xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L046xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1074,6 +1131,7 @@ aesni_xts_encrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1089,19 +1147,17 @@ aesni_xts_encrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1126,26 +1182,25 @@ aesni_xts_encrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L046xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L045xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L047xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L048xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L049xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1154,7 +1209,7 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L050xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1164,7 +1219,7 @@ aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L051xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1196,9 +1251,9 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L046xts_enc_one:
+.L048xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1206,37 +1261,36 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L053enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L053enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L047xts_enc_two:
+.L049xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L048xts_enc_three:
+.L050xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1254,9 +1308,9 @@ aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L049xts_enc_four:
+.L051xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1278,28 +1332,28 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L052xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L047xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L054xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L055xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L052xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L054xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L055xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1307,7 +1361,7 @@ aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L055xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1317,16 +1371,16 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L056enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L056enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L054xts_enc_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1351,12 +1405,12 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L057enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L057enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1385,12 +1439,14 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L058xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L059xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L059xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1426,6 +1482,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1441,19 +1498,17 @@ aesni_xts_decrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1478,26 +1533,25 @@ aesni_xts_decrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L059xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L058xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L060xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L061xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L062xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1506,7 +1560,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L063xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1516,7 +1570,7 @@ aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L064xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1548,9 +1602,9 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L059xts_dec_one:
+.L061xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1558,36 +1612,36 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L066dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L066dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L060xts_dec_two:
+.L062xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L061xts_dec_three:
+.L063xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1605,9 +1659,9 @@ aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L062xts_dec_four:
+.L064xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1629,20 +1683,20 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L065xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L060xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L067xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L068xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L065xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L067xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1652,7 +1706,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L068xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1666,16 +1720,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L069dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L069dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L070xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1683,7 +1737,7 @@ aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L070xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1693,16 +1747,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L071dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L071dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L067xts_dec_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1728,7 +1782,7 @@ aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L072cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1736,14 +1790,14 @@ aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L073cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L074cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L075cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L075cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1751,24 +1805,24 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L076enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L076enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L075cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L074cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ jmp .L077cbc_ret
+.L074cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1779,20 +1833,20 @@ aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L075cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L073cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L078cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L079cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L080cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L079cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1822,28 +1876,28 @@ aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L080cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L081cbc_dec_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L078cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L082cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L083cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L084cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L085cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1866,28 +1920,27 @@ aesni_cbc_encrypt:
leal 64(%edi),%edi
movaps %xmm6,%xmm2
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L082cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L086dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L086dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L083cbc_dec_two:
+ call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
@@ -1895,9 +1948,9 @@ aesni_cbc_encrypt:
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L084cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
@@ -1908,9 +1961,9 @@ aesni_cbc_encrypt:
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L081cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L085cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1925,23 +1978,23 @@ aesni_cbc_encrypt:
leal 48(%edi),%edi
movaps %xmm5,%xmm2
subl $64,%eax
-.L079cbc_dec_tail_collected:
+.L081cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L087cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ jmp .L077cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L087cbc_dec_tail_partial:
movaps %xmm2,(%esp)
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+.L077cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
movups %xmm7,(%ebp)
-.L070cbc_abort:
+.L072cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1952,51 +2005,51 @@ aesni_cbc_encrypt:
.align 16
_aesni_set_encrypt_key:
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L088bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L088bad_pointer
movups (%eax),%xmm0
xorps %xmm4,%xmm4
leal 16(%edx),%edx
cmpl $256,%ecx
- je .L08714rounds
+ je .L08914rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09012rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L091bad_keybits
.align 16
-.L09010rounds:
+.L09210rounds:
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L093key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L094key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L094key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
xorl %eax,%eax
ret
.align 16
-.L092key_128:
+.L094key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L093key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2005,38 +2058,38 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09012rounds:
movq 16(%eax),%xmm2
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L095key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L096key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L097key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L096key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
xorl %eax,%eax
ret
.align 16
-.L095key_192a:
+.L097key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L095key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L098key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2050,56 +2103,56 @@ _aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L096key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L098key_192b_warm
.align 16
-.L08714rounds:
+.L08914rounds:
movups 16(%eax),%xmm2
movl $13,%ecx
leal 16(%edx),%edx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L099key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L101key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L100key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L101key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
ret
.align 16
-.L099key_256a:
+.L101key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L099key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2108,7 +2161,7 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L100key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2119,11 +2172,11 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm2
ret
.align 4
-.L086bad_pointer:
+.L088bad_pointer:
movl $-1,%eax
ret
.align 4
-.L089bad_keybits:
+.L091bad_keybits:
movl $-2,%eax
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
@@ -2150,7 +2203,7 @@ aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L102dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2158,7 +2211,7 @@ aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L103dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2168,12 +2221,12 @@ aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L103dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
xorl %eax,%eax
-.L100dec_key_ret:
+.L102dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
diff --git a/lib/accelerated/x86/elf/aesni-x86_64.s b/lib/accelerated/x86/elf/aesni-x86_64.s
index 59b37d83ba..996c531af2 100644
--- a/lib/accelerated/x86/elf/aesni-x86_64.s
+++ b/lib/accelerated/x86/elf/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.type aesni_encrypt,@function
.align 16
@@ -53,7 +54,7 @@ aesni_encrypt:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
@@ -74,34 +75,93 @@ aesni_decrypt:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
@@ -116,25 +176,26 @@ _aesni_encrypt3:
.align 16
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
@@ -149,28 +210,30 @@ _aesni_decrypt3:
.align 16
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
@@ -187,28 +250,30 @@ _aesni_encrypt4:
.align 16
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
@@ -225,43 +290,43 @@ _aesni_decrypt4:
.align 16
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%rcx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
@@ -282,43 +347,43 @@ _aesni_encrypt6:
.align 16
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%rcx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,51 @@ _aesni_decrypt6:
.align 16
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
+.byte 102,15,56,220,217
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop8_enter
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
@@ -409,52 +473,51 @@ _aesni_encrypt8:
.align 16
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
+.byte 102,15,56,222,217
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop8_enter
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
@@ -583,14 +646,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -728,14 +790,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -782,53 +843,53 @@ aesni_ecb_encrypt:
.align 16
aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
movups %xmm3,(%r9)
@@ -839,15 +900,15 @@ aesni_ccm64_encrypt_blocks:
.align 16
aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -857,17 +918,21 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -876,36 +941,36 @@ aesni_ccm64_decrypt_blocks:
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.align 16
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -916,7 +981,7 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
movups %xmm3,(%r9)
.byte 0xf3,0xc3
@@ -925,199 +990,518 @@ aesni_ccm64_decrypt_blocks:
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $128,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je .Lctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa .Lbswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
-
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
movl 240(%rcx),%eax
+ xorl %r11d,%r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,-40(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,-24(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
+ xorl %r11d,%r9d
+ andl $71303168,%r10d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
jb .Lctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
+
subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
+ leaq 128(%rcx),%rcx
+ subq $2,%rdx
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+ call .Lenc_loop6
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
- pxor %xmm0,%xmm3
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa .Lincrement32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa -40(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz .Lctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb .Lctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd -24(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,-40(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,-24(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc .Lctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
- addq $6,%rdx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
jz .Lctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
.Lctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb .Lctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je .Lctr32_two
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb .Lctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je .Lctr32_four
+ call .Lenc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-.Lctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1127,289 +1511,303 @@ aesni_ctr32_encrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
+ jnz .Loop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp .Lctr32_done
.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
.Lctr32_done:
+ leaq (%rbp),%rsp
+ popq %rbp
+.Lctr32_epilogue:
.byte 0xf3,0xc3
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,@function
.align 16
aesni_xts_encrypt:
- leaq -104(%rsp),%rsp
- movups (%r9),%xmm15
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.align 16
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1450,7 +1848,7 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1466,7 +1864,7 @@ aesni_xts_encrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1512,15 +1910,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1555,13 +1953,14 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1569,249 +1968,292 @@ aesni_xts_encrypt:
.type aesni_xts_decrypt,@function
.align 16
aesni_xts_decrypt:
- leaq -104(%rsp),%rsp
- movups (%r9),%xmm15
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.align 16
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1861,7 +2303,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1878,7 +2320,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1904,7 +2346,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -1914,14 +2356,8 @@ aesni_xts_decrypt:
.align 16
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -1932,16 +2368,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -1965,7 +2401,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -1995,13 +2431,14 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2038,7 +2475,7 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2054,163 +2491,397 @@ aesni_cbc_encrypt:
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.align 16
.Lcbc_decrypt:
- movups (%r8),%xmm9
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $16,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe .Lcbc_dec_tail
- shrl $1,%r10d
- subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,-24(%rsp)
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
+ andl $71303168,%r9d
+ subq $80,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $32,%rdx
+ leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
- movaps %xmm0,-24(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
.byte 102,68,15,56,222,193
+ setnc %r11b
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call .Ldec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $96,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $80,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe .Lcbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe .Lcbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,-24(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2220,113 +2891,70 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps %xmm2,-24(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq -24(%rsp),%rsi
-.long 0x9066A4F3
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
.Lcbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
.Lcbc_ret:
.byte 0xf3,0xc3
.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
@@ -2334,7 +2962,7 @@ aesni_cbc_encrypt:
.type aesni_set_decrypt_key,@function
.align 16
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
@@ -2373,7 +3001,7 @@ aesni_set_decrypt_key:
.align 16
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz .Lenc_key_ret
@@ -2569,6 +3197,8 @@ __aesni_set_encrypt_key:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/lib/accelerated/x86/elf/e_padlock-x86.s b/lib/accelerated/x86/elf/e_padlock-x86.s
index 65f32f9601..0b8fc28810 100644
--- a/lib/accelerated/x86/elf/e_padlock-x86.s
+++ b/lib/accelerated/x86/elf/e_padlock-x86.s
@@ -187,16 +187,14 @@ padlock_ecb_encrypt:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -208,10 +206,28 @@ padlock_ecb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -236,8 +252,8 @@ padlock_ecb_encrypt:
testl $15,%edi
jz .L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L010ecb_out_aligned:
@@ -247,43 +263,75 @@ padlock_ecb_encrypt:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
cmpl %ebp,%esp
- je .L011ecb_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
+ cmpl %ebp,%esp
+ je .L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
.L004ecb_abort:
@@ -307,19 +355,17 @@ padlock_cbc_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
testl $15,%ecx
- jnz .L015cbc_abort
- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax
+ jnz .L016cbc_abort
+ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax
pushfl
cld
call _padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
testl $32,(%edx)
jnz .L018cbc_aligned
testl $15,%edi
@@ -339,7 +385,25 @@ padlock_cbc_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -351,13 +415,13 @@ padlock_cbc_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -367,61 +431,93 @@ padlock_cbc_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
cmpl %ebp,%esp
- je .L022cbc_done
+ je .L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L024cbc_exit
-.align 16
-.L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L019cbc_loop
+ jmp .L026cbc_exit
.align 16
.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -442,25 +538,25 @@ padlock_cfb_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L026cfb_abort
+ jnz .L028cfb_abort
testl $15,%ecx
- jnz .L026cfb_abort
- leal .Lpadlock_saved_context-.L027cfb_pic_point,%eax
+ jnz .L028cfb_abort
+ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax
pushfl
cld
call _padlock_verify_ctx
-.L027cfb_pic_point:
+.L029cfb_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
testl $32,(%edx)
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -472,10 +568,15 @@ padlock_cfb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L029cfb_loop
+ movl %eax,16(%ebp)
+ jmp .L031cfb_loop
.align 16
-.L029cfb_loop:
+.L031cfb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -484,13 +585,13 @@ padlock_cfb_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L030cfb_inp_aligned
+ jz .L032cfb_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L030cfb_inp_aligned:
+.L032cfb_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -500,61 +601,45 @@ padlock_cfb_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L031cfb_out_aligned
+ jz .L033cfb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L031cfb_out_aligned:
+.L033cfb_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L029cfb_loop
+ jnz .L031cfb_loop
cmpl %ebp,%esp
- je .L032cfb_done
+ je .L034cfb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L033cfb_bzero:
+.L035cfb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L033cfb_bzero
-.L032cfb_done:
+ ja .L035cfb_bzero
+.L034cfb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L034cfb_exit
+ jmp .L036cfb_exit
.align 16
-.L035cfb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L036cfb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L036cfb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L029cfb_loop
-.align 16
-.L028cfb_aligned:
+.L030cfb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,224
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L034cfb_exit:
+.L036cfb_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L026cfb_abort:
+.L028cfb_abort:
popl %edi
popl %esi
popl %ebx
@@ -605,7 +690,12 @@ padlock_ofb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
jmp .L040ofb_loop
.align 16
.L040ofb_loop:
@@ -635,8 +725,8 @@ padlock_ofb_encrypt:
testl $15,%edi
jz .L042ofb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L042ofb_out_aligned:
@@ -657,26 +747,10 @@ padlock_ofb_encrypt:
cmpl %eax,%ebp
ja .L044ofb_bzero
.L043ofb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
jmp .L045ofb_exit
.align 16
-.L046ofb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L047ofb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L047ofb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L040ofb_loop
-.align 16
.L039ofb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
@@ -708,14 +782,14 @@ padlock_ctr32_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L048ctr32_abort
+ jnz .L046ctr32_abort
testl $15,%ecx
- jnz .L048ctr32_abort
- leal .Lpadlock_saved_context-.L049ctr32_pic_point,%eax
+ jnz .L046ctr32_abort
+ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax
pushfl
cld
call _padlock_verify_ctx
-.L049ctr32_pic_point:
+.L047ctr32_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
movq -16(%edx),%mm0
@@ -729,10 +803,15 @@ padlock_ctr32_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L050ctr32_loop
+ movl %eax,16(%ebp)
+ jmp .L048ctr32_loop
.align 16
-.L050ctr32_loop:
+.L048ctr32_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -741,7 +820,7 @@ padlock_ctr32_encrypt:
movl -4(%edx),%ecx
xorl %edi,%edi
movl -8(%edx),%eax
-.L051ctr32_prepare:
+.L049ctr32_prepare:
movl %ecx,12(%esp,%edi,1)
bswap %ecx
movq %mm0,(%esp,%edi,1)
@@ -750,7 +829,7 @@ padlock_ctr32_encrypt:
bswap %ecx
leal 16(%edi),%edi
cmpl %ebx,%edi
- jb .L051ctr32_prepare
+ jb .L049ctr32_prepare
movl %ecx,-4(%edx)
leal (%esp),%esi
leal (%esp),%edi
@@ -763,32 +842,33 @@ padlock_ctr32_encrypt:
movl 12(%ebp),%ebx
movl 4(%ebp),%esi
xorl %ecx,%ecx
-.L052ctr32_xor:
+.L050ctr32_xor:
movups (%esi,%ecx,1),%xmm1
leal 16(%ecx),%ecx
pxor -16(%esp,%ecx,1),%xmm1
movups %xmm1,-16(%edi,%ecx,1)
cmpl %ebx,%ecx
- jb .L052ctr32_xor
+ jb .L050ctr32_xor
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L050ctr32_loop
+ jnz .L048ctr32_loop
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L053ctr32_bzero:
+.L051ctr32_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L053ctr32_bzero
-.L054ctr32_done:
+ ja .L051ctr32_bzero
+.L052ctr32_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
movl $1,%eax
leal 4(%esp),%esp
emms
-.L048ctr32_abort:
+.L046ctr32_abort:
popl %edi
popl %esi
popl %ebx
@@ -814,10 +894,10 @@ _win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L055ret
+ jne .L053ret
addl $4,184(%ecx)
movl $0,%eax
-.L055ret:
+.L053ret:
ret
.size _win32_segv_handler,.-_win32_segv_handler
.globl padlock_sha1_oneshot
diff --git a/lib/accelerated/x86/elf/e_padlock-x86_64.s b/lib/accelerated/x86/elf/e_padlock-x86_64.s
index 014ecaf569..1f86a997a8 100644
--- a/lib/accelerated/x86/elf/e_padlock-x86_64.s
+++ b/lib/accelerated/x86/elf/e_padlock-x86_64.s
@@ -127,7 +127,7 @@ padlock_aes_block:
movq $1,%rcx
leaq 32(%rdx),%rbx
leaq 16(%rdx),%rdx
-.byte 0xf3,0x0f,0xa7,0xc8
+.byte 0xf3,0x0f,0xa7,0xc8
movq %r8,%rbx
.byte 0xf3,0xc3
.size padlock_aes_block,.-padlock_aes_block
@@ -137,7 +137,7 @@ padlock_aes_block:
.align 16
padlock_xstore:
movl %esi,%edx
-.byte 0x0f,0xa7,0xc0
+.byte 0x0f,0xa7,0xc0
.byte 0xf3,0xc3
.size padlock_xstore,.-padlock_xstore
@@ -154,7 +154,7 @@ padlock_sha1_oneshot:
movq %rsp,%rdi
movl %eax,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -176,7 +176,7 @@ padlock_sha1_blocks:
movq %rsp,%rdi
movl %eax,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -198,7 +198,7 @@ padlock_sha256_oneshot:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -220,7 +220,7 @@ padlock_sha256_blocks:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -245,7 +245,7 @@ padlock_sha512_blocks:
movaps %xmm1,16(%rsp)
movaps %xmm2,32(%rsp)
movaps %xmm3,48(%rsp)
-.byte 0xf3,0x0f,0xa6,0xe0
+.byte 0xf3,0x0f,0xa6,0xe0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
movaps 32(%rsp),%xmm2
@@ -276,8 +276,6 @@ padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
testl $32,(%rdx)
jnz .Lecb_aligned
testq $15,%rdi
@@ -297,6 +295,21 @@ padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
jmp .Lecb_loop
.align 16
.Lecb_loop:
@@ -312,7 +325,7 @@ padlock_ecb_encrypt:
testq $15,%rsi
jz .Lecb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -320,15 +333,15 @@ padlock_ecb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz .Lecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lecb_out_aligned:
movq %r9,%rsi
@@ -337,9 +350,26 @@ padlock_ecb_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.align 16
+.Lecb_break:
+ cmpq %rbp,%rsp
je .Lecb_done
pxor %xmm0,%xmm0
@@ -353,26 +383,39 @@ padlock_ecb_encrypt:
.Lecb_done:
leaq (%rbp),%rsp
jmp .Lecb_exit
-.align 16
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.align 16
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -400,8 +443,6 @@ padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
testl $32,(%rdx)
jnz .Lcbc_aligned
testq $15,%rdi
@@ -421,6 +462,21 @@ padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
jmp .Lcbc_loop
.align 16
.Lcbc_loop:
@@ -436,7 +492,7 @@ padlock_cbc_encrypt:
testq $15,%rsi
jz .Lcbc_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -444,7 +500,7 @@ padlock_cbc_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -452,9 +508,9 @@ padlock_cbc_encrypt:
testq $15,%rdi
jz .Lcbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcbc_out_aligned:
movq %r9,%rsi
@@ -463,9 +519,26 @@ padlock_cbc_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.align 16
+.Lcbc_break:
+ cmpq %rbp,%rsp
je .Lcbc_done
pxor %xmm0,%xmm0
@@ -479,28 +552,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
leaq (%rbp),%rsp
jmp .Lcbc_exit
-.align 16
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.align 16
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -547,6 +633,8 @@ padlock_cfb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp .Lcfb_loop
.align 16
.Lcfb_loop:
@@ -562,7 +650,7 @@ padlock_cfb_encrypt:
testq $15,%rsi
jz .Lcfb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -570,7 +658,7 @@ padlock_cfb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -578,9 +666,9 @@ padlock_cfb_encrypt:
testq $15,%rdi
jz .Lcfb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcfb_out_aligned:
movq %r9,%rsi
@@ -590,8 +678,7 @@ padlock_cfb_encrypt:
subq %rbx,%rcx
movq $512,%rbx
jnz .Lcfb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je .Lcfb_done
pxor %xmm0,%xmm0
@@ -605,12 +692,13 @@ padlock_cfb_encrypt:
.Lcfb_done:
leaq (%rbp),%rsp
jmp .Lcfb_exit
+
.align 16
.Lcfb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
.Lcfb_exit:
@@ -659,6 +747,8 @@ padlock_ofb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp .Lofb_loop
.align 16
.Lofb_loop:
@@ -674,7 +764,7 @@ padlock_ofb_encrypt:
testq $15,%rsi
jz .Lofb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -682,7 +772,7 @@ padlock_ofb_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -690,9 +780,9 @@ padlock_ofb_encrypt:
testq $15,%rdi
jz .Lofb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lofb_out_aligned:
movq %r9,%rsi
@@ -702,8 +792,7 @@ padlock_ofb_encrypt:
subq %rbx,%rcx
movq $512,%rbx
jnz .Lofb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je .Lofb_done
pxor %xmm0,%xmm0
@@ -717,12 +806,13 @@ padlock_ofb_encrypt:
.Lofb_done:
leaq (%rbp),%rsp
jmp .Lofb_exit
+
.align 16
.Lofb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
.Lofb_exit:
@@ -752,8 +842,6 @@ padlock_ctr32_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lctr32_short
testl $32,(%rdx)
jnz .Lctr32_aligned
testq $15,%rdi
@@ -773,15 +861,32 @@ padlock_ctr32_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
.Lctr32_reenter:
movl -4(%rdx),%eax
bswapl %eax
negl %eax
andl $31,%eax
- jz .Lctr32_loop
+ movq $512,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja .Lctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lctr32_unaligned_tail
jmp .Lctr32_loop
.align 16
.Lctr32_loop:
@@ -797,7 +902,7 @@ padlock_ctr32_encrypt:
testq $15,%rsi
jz .Lctr32_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -805,23 +910,23 @@ padlock_ctr32_encrypt:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
movl -4(%rdx),%eax
testl $4294901760,%eax
- jnz .Lctr32_no_corr
+ jnz .Lctr32_no_carry
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
-.Lctr32_no_corr:
+.Lctr32_no_carry:
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz .Lctr32_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lctr32_out_aligned:
movq %r9,%rsi
@@ -830,9 +935,38 @@ padlock_ctr32_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
+ jz .Lctr32_break
+ cmpq %rbx,%rcx
+ jae .Lctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
jnz .Lctr32_loop
-
+.Lctr32_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.align 16
+.Lctr32_break:
+ cmpq %rbp,%rsp
je .Lctr32_done
pxor %xmm0,%xmm0
@@ -846,56 +980,75 @@ padlock_ctr32_encrypt:
.Lctr32_done:
leaq (%rbp),%rsp
jmp .Lctr32_exit
-.align 16
-.Lctr32_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lctr32_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lctr32_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lctr32_reenter
+
.align 16
.Lctr32_aligned:
movl -4(%rdx),%eax
- movq $1048576,%rbx
bswapl %eax
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
negl %eax
andl $65535,%eax
- jz .Lctr32_aligned_loop
+ movq $1048576,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
- jmp .Lctr32_aligned_loop
-.align 16
+ cmovbeq %rcx,%rbx
+ jbe .Lctr32_aligned_skip
+
.Lctr32_aligned_loop:
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
movq %rcx,%r10
movq %rbx,%rcx
movq %rbx,%r11
+
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
+
movl -4(%rdx),%eax
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
- movq %r11,%rbx
movq %r10,%rcx
- subq %rbx,%rcx
+ subq %r11,%rcx
movq $1048576,%rbx
- jnz .Lctr32_aligned_loop
+ jz .Lctr32_exit
+ cmpq %rbx,%rcx
+ jae .Lctr32_aligned_loop
+
+.Lctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz .Lctr32_exit
+
+.Lctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
.Lctr32_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s
index f60c95b546..17f6bebe6e 100644
--- a/lib/accelerated/x86/elf/ghash-x86_64.s
+++ b/lib/accelerated/x86/elf/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,@function
.align 16
@@ -697,6 +698,7 @@ gcm_ghash_4bit:
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
+.L_init_clmul:
movdqu (%rsi),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -715,15 +717,15 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -733,44 +735,134 @@ gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -783,201 +875,381 @@ gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,@function
-.align 16
+.align 32
gcm_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm5
+.L_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
subq $16,%rcx
jz .Lodd_tail
- movdqu 16(%rsi),%xmm8
+ movdqu 16(%rsi),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $48,%rcx
+ jb .Lskip4x
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+ subq $48,%rcx
+ movq $11547335547999543296,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe .Leven_tail
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc .Ltail4x
-.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+.byte 102,68,15,58,68,218,0
+ xorps %xmm4,%xmm8
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
-
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
- pxor %xmm3,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm3,%xmm4
+ pxor %xmm1,%xmm8
+ movdqa %xmm8,%xmm9
+ pslldq $8,%xmm8
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+.byte 102,68,15,58,68,231,0
+ pxor %xmm0,%xmm9
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pxor %xmm1,%xmm0
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
-.byte 102,15,58,68,250,17
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz .Ldone
+ movdqu 32(%rsi),%xmm7
+ subq $16,%rcx
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
-.byte 102,69,15,58,68,202,0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $32,%rcx
+ jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+
+.align 32
+.Lmod_loop:
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm5
+ pxor %xmm0,%xmm8
+.byte 102,65,15,56,0,234
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm5,%xmm1
pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+.byte 102,15,58,68,234,17
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+.byte 0x66,0x90
+
subq $32,%rcx
ja .Lmod_loop
.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %rcx,%rcx
jnz .Ldone
.Lodd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -987,38 +1259,60 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.align 64
.type .Lrem_4bit,@object
.Lrem_4bit:
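
In the regenerated ghash-x86_64.s above, gcm_init_clmul now fills a larger key table: H at 0(%rdi), its square at 16(%rdi), higher powers at 48(%rdi) and 64(%rdi), and the pre-xored halves of each pair at 32(%rdi) and 80(%rdi) for the Karatsuba-style pclmulqdq split. gcm_ghash_clmul uses that table, gated by a check on _gnutls_x86_cpuid_s and the remaining length, to fold four blocks per reduction in .Lmod4_loop before falling back to the two-block .Lmod_loop and the single-block .Lodd_tail, and the new gcm_*_avx symbols are plain jumps to the CLMUL entry points. All of these paths evaluate the same GF(2^128) multiply that GHASH is defined by. The plain-C sketch below is only a reference model of that multiplication in the NIST SP 800-38D bit convention, matching the assembly up to the byte shuffle done with .Lbswap_mask; the function names are invented here for illustration and are not part of this commit.

#include <stdint.h>

/* Reference GHASH multiplication in GF(2^128) following the
 * NIST SP 800-38D bit-reflected convention (R = 0xE1 || 0^120).
 * The accelerated code reaches the same result with PCLMULQDQ,
 * a Karatsuba split of the operands and a folding reduction. */

static uint64_t load_be64(const uint8_t *p)
{
    uint64_t v = 0;
    for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
    return v;
}

static void store_be64(uint8_t *p, uint64_t v)
{
    for (int i = 7; i >= 0; i--) {
        p[i] = (uint8_t)v;
        v >>= 8;
    }
}

/* x <- x * y in GF(2^128); both operands are 16-byte GHASH blocks. */
static void gf128_mul(uint8_t x[16], const uint8_t y[16])
{
    uint64_t zh = 0, zl = 0;
    uint64_t vh = load_be64(y), vl = load_be64(y + 8);
    uint64_t xh = load_be64(x), xl = load_be64(x + 8);

    for (int i = 0; i < 128; i++) {
        /* Bit i of x, most significant bit of the block first. */
        uint64_t bit = (i < 64) ? (xh >> (63 - i)) & 1
                                : (xl >> (127 - i)) & 1;
        if (bit) {
            zh ^= vh;
            zl ^= vl;
        }
        /* Multiply v by the field element "x": shift right one bit and
         * fold the dropped bit back in with R = 0xE1 << 120. */
        uint64_t lsb = vl & 1;
        vl = (vl >> 1) | (vh << 63);
        vh >>= 1;
        if (lsb)
            vh ^= 0xe100000000000000ULL;
    }
    store_be64(x, zh);
    store_be64(x + 8, zl);
}

GHASH over a message is then just Y = gf128_mul(Y xor C_i, H) block by block, which is the multiply-accumulate the aggregated .Lmod4_loop performs four ciphertext blocks at a time using the precomputed powers of H.
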
diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
index 3bb65a54e9..116efd0c18 100644
--- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
+++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
@@ -46,23 +46,25 @@
sha1_block_data_order:
movl _gnutls_x86_cpuid_s+0(%rip),%r9d
movl _gnutls_x86_cpuid_s+4(%rip),%r8d
+ movl _gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz .Lialu
jmp _ssse3_shortcut
.align 16
.Lialu:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
- movq %rsp,%r11
+ pushq %r14
movq %rdi,%r8
subq $72,%rsp
movq %rsi,%r9
andq $-64,%rsp
movq %rdx,%r10
- movq %r11,64(%rsp)
+ movq %rax,64(%rsp)
.Lprologue:
movl 0(%r8),%esi
@@ -76,1230 +78,1168 @@ sha1_block_data_order:
.Lloop:
movl 0(%r9),%edx
bswapl %edx
- movl %edx,0(%rsp)
- movl %r11d,%eax
movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %ebp
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,4(%rsp)
+ leal 1518500249(%rdx,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
- movl 8(%r9),%edx
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,8(%rsp)
+ leal 1518500249(%rbp,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 12(%r9),%ebp
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,12(%rsp)
+ leal 1518500249(%r14,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 16(%r9),%edx
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,16(%rsp)
+ leal 1518500249(%rdx,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 20(%r9),%ebp
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,20(%rsp)
+ leal 1518500249(%rbp,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %edx
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r13,1),%r13d
andl %edi,%eax
- movl %edx,24(%rsp)
+ leal 1518500249(%r14,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %ebp
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r12,1),%r12d
andl %esi,%eax
- movl %ebp,28(%rsp)
+ leal 1518500249(%rdx,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 32(%r9),%edx
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r11,1),%r11d
andl %r13d,%eax
- movl %edx,32(%rsp)
+ leal 1518500249(%rbp,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 36(%r9),%ebp
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rdi,1),%edi
andl %r12d,%eax
- movl %ebp,36(%rsp)
+ leal 1518500249(%r14,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 40(%r9),%edx
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rsi,1),%esi
andl %r11d,%eax
- movl %edx,40(%rsp)
+ leal 1518500249(%rdx,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
- movl 44(%r9),%ebp
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,44(%rsp)
+ leal 1518500249(%rbp,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %edx
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,48(%rsp)
+ leal 1518500249(%r14,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
bswapl %ebp
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,52(%rsp)
+ leal 1518500249(%rdx,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 56(%r9),%edx
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,56(%rsp)
+ leal 1518500249(%rbp,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 60(%r9),%ebp
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,60(%rsp)
+ leal 1518500249(%r14,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl 0(%rsp),%edx
- movl %r11d,%eax
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %esi,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
roll $5,%ecx
- xorl 32(%rsp),%edx
+ xorl 32(%rsp),%ebp
andl %edi,%eax
- leal 1518500249(%rbp,%r13,1),%r13d
- xorl 52(%rsp),%edx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
xorl %r12d,%eax
- roll $1,%edx
addl %ecx,%r13d
- roll $30,%edi
- movl %edx,0(%rsp)
+ roll $1,%ebp
addl %eax,%r13d
- movl 4(%rsp),%ebp
- movl %edi,%eax
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %r13d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
- xorl 36(%rsp),%ebp
+ xorl 36(%rsp),%r14d
andl %esi,%eax
- leal 1518500249(%rdx,%r12,1),%r12d
- xorl 56(%rsp),%ebp
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
xorl %r11d,%eax
- roll $1,%ebp
addl %ecx,%r12d
- roll $30,%esi
- movl %ebp,4(%rsp)
+ roll $1,%r14d
addl %eax,%r12d
- movl 8(%rsp),%edx
- movl %esi,%eax
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %r12d,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
roll $5,%ecx
xorl 40(%rsp),%edx
andl %r13d,%eax
- leal 1518500249(%rbp,%r11,1),%r11d
- xorl 60(%rsp),%edx
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
xorl %edi,%eax
- roll $1,%edx
addl %ecx,%r11d
- roll $30,%r13d
- movl %edx,8(%rsp)
+ roll $1,%edx
addl %eax,%r11d
- movl 12(%rsp),%ebp
- movl %r13d,%eax
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
movl %r11d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r13d,%eax
roll $5,%ecx
xorl 44(%rsp),%ebp
andl %r12d,%eax
leal 1518500249(%rdx,%rdi,1),%edi
- xorl 0(%rsp),%ebp
+ roll $30,%r12d
xorl %esi,%eax
- roll $1,%ebp
addl %ecx,%edi
- roll $30,%r12d
- movl %ebp,12(%rsp)
+ roll $1,%ebp
addl %eax,%edi
- movl 16(%rsp),%edx
- movl %r12d,%eax
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
movl %edi,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
- xorl 48(%rsp),%edx
+ xorl 48(%rsp),%r14d
andl %r11d,%eax
leal 1518500249(%rbp,%rsi,1),%esi
- xorl 4(%rsp),%edx
+ roll $30,%r11d
xorl %r13d,%eax
- roll $1,%edx
addl %ecx,%esi
- roll $30,%r11d
- movl %edx,16(%rsp)
+ roll $1,%r14d
addl %eax,%esi
- movl 20(%rsp),%ebp
- movl %r11d,%eax
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
movl %esi,%ecx
- xorl 28(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 8(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %edi,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
movl %r13d,%ecx
- xorl 32(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r12,1),%r12d
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 12(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
movl %r12d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 16(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
movl %r11d,%ecx
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
xorl 0(%rsp),%edx
- xorl %esi,%eax
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 20(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r12d,%eax
movl %edi,%ecx
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
xorl 4(%rsp),%ebp
- xorl %r13d,%eax
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 24(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r11d,%eax
movl %esi,%ecx
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal 1859775393(%rbp,%r13,1),%r13d
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 28(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
movl %r13d,%ecx
- xorl 52(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 32(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %esi,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
movl %r12d,%ecx
- xorl 56(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 36(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
movl %r11d,%ecx
- xorl 60(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 40(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
movl %edi,%ecx
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 44(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
xorl 28(%rsp),%ebp
- xorl %r12d,%eax
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 48(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 32(%rsp),%r14d
leal 1859775393(%rbp,%r12,1),%r12d
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 52(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
movl %r12d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 56(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
- xorl 40(%rsp),%edx
+ xorl 16(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 60(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl 44(%rsp),%ebp
+ xorl 20(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 0(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
movl %esi,%ecx
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%r13,1),%r13d
xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 4(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
xorl 52(%rsp),%ebp
- xorl %r11d,%eax
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 8(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 32(%rsp),%edx
- xorl %r13d,%eax
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 56(%rsp),%r14d
leal 1859775393(%rbp,%r11,1),%r11d
- xorl 56(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 12(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
movl %r11d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 16(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
- xorl 0(%rsp),%edx
+ xorl 40(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 20(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 44(%rsp),%ebp
- andl %r12d,%eax
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 4(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,36(%rsp)
addl %ecx,%r13d
- movl 40(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
xorl 48(%rsp),%edx
- andl %r11d,%eax
+ andl %edi,%eax
movl %r13d,%ecx
xorl 8(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r12d
- andl %esi,%ebx
roll $1,%edx
- addl %ebx,%r12d
+ andl %esi,%ebx
+ addl %ecx,%r12d
roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
movl %edx,40(%rsp)
- addl %ecx,%r12d
- movl 44(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
+ movl %edi,%ebx
xorl 52(%rsp),%ebp
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 12(%rsp),%ebp
- xorl %edi,%ebx
leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%ebp
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,44(%rsp)
- addl %ecx,%r11d
- movl 48(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 56(%rsp),%edx
- andl %esi,%eax
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %esi,%ebx
+ xorl 16(%rsp),%r14d
leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%edx
- addl %ebx,%edi
- roll $30,%r12d
- movl %edx,48(%rsp)
addl %ecx,%edi
- movl 52(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 60(%rsp),%ebp
- andl %r13d,%eax
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r13d,%ebx
- leal -1894007588(%rdx,%rsi,1),%esi
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 40(%rsp),%ebp
addl %eax,%esi
+ roll $1,%edx
andl %r11d,%ebx
- roll $1,%ebp
- addl %ebx,%esi
- roll $30,%r11d
- movl %ebp,52(%rsp)
addl %ecx,%esi
- movl 56(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 0(%rsp),%edx
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 24(%rsp),%edx
- xorl %r12d,%ebx
- leal -1894007588(%rbp,%r13,1),%r13d
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 44(%rsp),%edx
addl %eax,%r13d
+ roll $1,%ebp
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,56(%rsp)
addl %ecx,%r13d
- movl 60(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 4(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 48(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%r14d
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,60(%rsp)
addl %ecx,%r12d
- movl 0(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
xorl 8(%rsp),%edx
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 32(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 52(%rsp),%edx
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%edx
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
movl %edx,0(%rsp)
- addl %ecx,%r11d
- movl 4(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
+ movl %esi,%ebx
xorl 12(%rsp),%ebp
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 36(%rsp),%ebp
- xorl %esi,%ebx
leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 56(%rsp),%ebp
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%ebp
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,4(%rsp)
- addl %ecx,%edi
- movl 8(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 16(%rsp),%edx
- andl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r13d,%ebx
+ xorl 40(%rsp),%r14d
leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 60(%rsp),%edx
addl %eax,%esi
+ roll $1,%r14d
andl %r11d,%ebx
- roll $1,%edx
- addl %ebx,%esi
- roll $30,%r11d
- movl %edx,8(%rsp)
addl %ecx,%esi
- movl 12(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 20(%rsp),%ebp
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 44(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 0(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%edx
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,12(%rsp)
addl %ecx,%r13d
- movl 16(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
- xorl 24(%rsp),%edx
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 48(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 4(%rsp),%edx
addl %eax,%r12d
+ roll $1,%ebp
andl %esi,%ebx
- roll $1,%edx
- addl %ebx,%r12d
- roll $30,%esi
- movl %edx,16(%rsp)
addl %ecx,%r12d
- movl 20(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
- xorl 28(%rsp),%ebp
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 52(%rsp),%ebp
- xorl %edi,%ebx
- leal -1894007588(%rdx,%r11,1),%r11d
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 8(%rsp),%ebp
addl %eax,%r11d
+ roll $1,%r14d
andl %r13d,%ebx
- roll $1,%ebp
- addl %ebx,%r11d
- roll $30,%r13d
- movl %ebp,20(%rsp)
addl %ecx,%r11d
- movl 24(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
xorl 32(%rsp),%edx
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 56(%rsp),%edx
- xorl %esi,%ebx
- leal -1894007588(%rbp,%rdi,1),%edi
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 12(%rsp),%edx
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%edx
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
movl %edx,24(%rsp)
- addl %ecx,%edi
- movl 28(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
+ movl %r13d,%ebx
xorl 36(%rsp),%ebp
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 60(%rsp),%ebp
- xorl %r13d,%ebx
leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 16(%rsp),%ebp
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%ebp
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
movl %ebp,28(%rsp)
- addl %ecx,%esi
- movl 32(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 40(%rsp),%edx
- andl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 0(%rsp),%edx
- xorl %r12d,%ebx
+ xorl 0(%rsp),%r14d
leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 20(%rsp),%edx
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,32(%rsp)
addl %ecx,%r13d
- movl 36(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 44(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%edx
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,36(%rsp)
addl %ecx,%r12d
- movl 40(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
- xorl 48(%rsp),%edx
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 8(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r11d
+ roll $1,%ebp
andl %r13d,%ebx
- roll $1,%edx
- addl %ebx,%r11d
- roll $30,%r13d
- movl %edx,40(%rsp)
addl %ecx,%r11d
- movl 44(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 52(%rsp),%ebp
- andl %esi,%eax
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 12(%rsp),%ebp
- xorl %esi,%ebx
- leal -1894007588(%rdx,%rdi,1),%edi
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%ebp
- addl %ebx,%edi
- roll $30,%r12d
- movl %ebp,44(%rsp)
addl %ecx,%edi
- movl 48(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
xorl 56(%rsp),%edx
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 16(%rsp),%edx
- xorl %r13d,%ebx
- leal -1894007588(%rbp,%rsi,1),%esi
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%edx
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
movl %edx,48(%rsp)
- addl %ecx,%esi
- movl 52(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 60(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
xorl 20(%rsp),%ebp
- xorl %r12d,%eax
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 40(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 0(%rsp),%edx
- xorl %esi,%eax
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 24(%rsp),%r14d
leal -899497514(%rbp,%r12,1),%r12d
- xorl 24(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 44(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %r12d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
- xorl 28(%rsp),%ebp
+ xorl 4(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 48(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %r11d,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rdi,1),%edi
- xorl 32(%rsp),%edx
+ xorl 8(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 52(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %edi,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 56(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %esi,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 60(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 0(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 48(%rsp),%r14d
leal -899497514(%rbp,%r11,1),%r11d
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 4(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
movl %r11d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 8(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
movl %edi,%ecx
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rsi,1),%esi
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 12(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r11d,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
movl %esi,%ecx
- xorl 36(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 16(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
movl %r13d,%ecx
xorl 40(%rsp),%edx
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r12,1),%r12d
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 20(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %esi,%eax
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 44(%rsp),%ebp
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 24(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
roll $1,%ebp
- movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r13d,%eax
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
movl %r11d,%ecx
- xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal -899497514(%rbp,%rdi,1),%edi
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 28(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
movl %edi,%ecx
- xorl 52(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 32(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
movl %esi,%ecx
- xorl 56(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 36(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
movl %r13d,%ecx
- xorl 60(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 40(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl 56(%rsp),%edx
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 0(%rsp),%edx
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r11,1),%r11d
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 44(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
roll $1,%edx
- movl 60(%rsp),%ebp
- movl %r13d,%eax
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
movl %r11d,%ecx
xorl 4(%rsp),%ebp
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 48(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
roll $1,%ebp
- movl %r12d,%eax
+ movl %r11d,%eax
movl %edi,%ecx
- xorl %r11d,%eax
+ xorl %r13d,%eax
leal -899497514(%rbp,%rsi,1),%esi
roll $5,%ecx
- xorl %r13d,%eax
+ xorl %r12d,%eax
addl %ecx,%esi
roll $30,%r11d
addl %eax,%esi
@@ -1319,11 +1259,12 @@ sha1_block_data_order:
jnz .Lloop
movq 64(%rsp),%rsi
- movq (%rsi),%r13
- movq 8(%rsi),%r12
- movq 16(%rsi),%rbp
- movq 24(%rsi),%rbx
- leaq 32(%rsi),%rsp
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue:
.byte 0xf3,0xc3
.size sha1_block_data_order,.-sha1_block_data_order
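
The sha1_block_data_order changes above keep the same scalar algorithm but save the caller's %rsp in %rax, push %r14 as an extra register for the message schedule, and restore everything from the saved pointer in the epilogue; the rounds still combine the rotate-by-5 and rotate-by-30 updates with the four group constants visible in the leal immediates (1518500249, 1859775393, -1894007588, -899497514). As a reference for what those rounds compute, a minimal C sketch of the per-round logic follows; the names are invented here and the code is not part of this commit.

#include <stdint.h>

/* SHA-1 round constants as they appear in the leal immediates above;
 * the last two are printed by the assembler as negative signed values. */
#define K00_19 0x5a827999u  /*  1518500249 */
#define K20_39 0x6ed9eba1u  /*  1859775393 */
#define K40_59 0x8f1bbcdcu  /* -1894007588 */
#define K60_79 0xca62c1d6u  /* -899497514  */

static uint32_t rol32(uint32_t v, int n)
{
    return (v << n) | (v >> (32 - n));
}

/* Round-dependent boolean function.  The assembly computes the t < 20
 * case as ((c ^ d) & b) ^ d and the 40..59 case as (c & d) + (b & (c ^ d)),
 * both equivalent to the textbook forms used here. */
static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
{
    if (t < 20) return (b & c) | (~b & d);
    if (t < 40) return b ^ c ^ d;
    if (t < 60) return (b & c) | (b & d) | (c & d);
    return b ^ c ^ d;
}

/* One SHA-1 round on the working state {a,b,c,d,e}; w is W[t].  The
 * chained xorl / roll $1 sequences above implement the schedule
 * expansion W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1). */
static void sha1_round(uint32_t s[5], int t, uint32_t k, uint32_t w)
{
    uint32_t tmp = rol32(s[0], 5) + sha1_f(t, s[1], s[2], s[3]) + s[4] + k + w;
    s[4] = s[3];
    s[3] = s[2];
    s[2] = rol32(s[1], 30);
    s[1] = s[0];
    s[0] = tmp;
}

Eighty such rounds per 64-byte block, followed by adding the working variables back into the hash state, is what the unrolled code above performs, permuting register roles between rounds instead of copying the state.
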
@@ -1331,17 +1272,22 @@ sha1_block_data_order:
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
+ pushq %r13
+ pushq %r14
leaq -64(%rsp),%rsp
+ movq %rax,%r14
+ andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r11
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1349,19 +1295,22 @@ _ssse3_shortcut:
movl 12(%r8),%edx
movl %ebx,%esi
movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
.byte 102,15,56,0,198
- addq $64,%r9
.byte 102,15,56,0,206
.byte 102,15,56,0,214
-.byte 102,15,56,0,222
+ addq $64,%r9
paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
paddd %xmm9,%xmm1
paddd %xmm9,%xmm2
movdqa %xmm0,0(%rsp)
@@ -1373,904 +1322,882 @@ _ssse3_shortcut:
jmp .Loop_ssse3
.align 16
.Loop_ssse3:
- movdqa %xmm1,%xmm4
- addl 0(%rsp),%ebp
- xorl %edx,%ecx
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
movdqa %xmm3,%xmm8
-.byte 102,15,58,15,224,8
+ paddd %xmm3,%xmm9
movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
roll $5,%eax
- paddd %xmm3,%xmm9
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl %esi,%ebp
psrldq $4,%xmm8
- xorl %edx,%esi
- addl %eax,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
pxor %xmm0,%xmm4
- rorl $2,%ebx
- addl %esi,%ebp
+ addl %eax,%ebp
+ rorl $7,%eax
pxor %xmm2,%xmm8
- addl 4(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 4(%rsp),%edx
pxor %xmm8,%xmm4
- andl %ebx,%edi
- xorl %ecx,%ebx
+ xorl %ebx,%eax
+ roll $5,%ebp
movdqa %xmm9,48(%rsp)
- xorl %ecx,%edi
- addl %ebp,%edx
- movdqa %xmm4,%xmm10
- movdqa %xmm4,%xmm8
- rorl $7,%eax
addl %edi,%edx
- addl 8(%rsp),%ecx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
pslldq $12,%xmm10
paddd %xmm4,%xmm4
movl %edx,%edi
- roll $5,%edx
- andl %eax,%esi
- xorl %ebx,%eax
+ addl 8(%rsp),%ecx
psrld $31,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
- movdqa %xmm10,%xmm9
- rorl $7,%ebp
+ xorl %eax,%ebp
+ roll $5,%edx
addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
por %xmm8,%xmm4
- addl 12(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 12(%rsp),%ebx
pslld $2,%xmm9
pxor %xmm10,%xmm4
- andl %ebp,%edi
- xorl %eax,%ebp
- movdqa 0(%r11),%xmm10
- xorl %eax,%edi
- addl %ecx,%ebx
- pxor %xmm9,%xmm4
- rorl $7,%edx
+ xorl %ebp,%edx
+ movdqa -64(%r11),%xmm10
+ roll $5,%ecx
addl %edi,%ebx
- movdqa %xmm2,%xmm5
- addl 16(%rsp),%eax
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
movdqa %xmm4,%xmm9
-.byte 102,15,58,15,233,8
+ paddd %xmm4,%xmm10
movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
roll $5,%ebx
- paddd %xmm4,%xmm10
- andl %edx,%esi
- xorl %ebp,%edx
+ addl %esi,%eax
psrldq $4,%xmm9
- xorl %ebp,%esi
- addl %ebx,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
pxor %xmm1,%xmm5
- rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
+ rorl $7,%ebx
pxor %xmm3,%xmm9
- addl 20(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 20(%rsp),%ebp
pxor %xmm9,%xmm5
- andl %ecx,%edi
- xorl %edx,%ecx
+ xorl %ecx,%ebx
+ roll $5,%eax
movdqa %xmm10,0(%rsp)
- xorl %edx,%edi
- addl %eax,%ebp
- movdqa %xmm5,%xmm8
- movdqa %xmm5,%xmm9
- rorl $7,%ebx
addl %edi,%ebp
- addl 24(%rsp),%edx
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
pslldq $12,%xmm8
paddd %xmm5,%xmm5
movl %ebp,%edi
- roll $5,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl 24(%rsp),%edx
psrld $31,%xmm9
- xorl %ecx,%esi
- addl %ebp,%edx
- movdqa %xmm8,%xmm10
- rorl $7,%eax
+ xorl %ebx,%eax
+ roll $5,%ebp
addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
por %xmm9,%xmm5
- addl 28(%rsp),%ecx
- xorl %ebx,%eax
+ xorl %ebx,%edi
movl %edx,%esi
- roll $5,%edx
+ addl 28(%rsp),%ecx
pslld $2,%xmm10
pxor %xmm8,%xmm5
- andl %eax,%edi
- xorl %ebx,%eax
- movdqa 16(%r11),%xmm8
- xorl %ebx,%edi
- addl %edx,%ecx
- pxor %xmm10,%xmm5
- rorl $7,%ebp
+ xorl %eax,%ebp
+ movdqa -32(%r11),%xmm8
+ roll $5,%edx
addl %edi,%ecx
- movdqa %xmm3,%xmm6
- addl 32(%rsp),%ebx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
movdqa %xmm5,%xmm10
-.byte 102,15,58,15,242,8
+ paddd %xmm5,%xmm8
movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
roll $5,%ecx
- paddd %xmm5,%xmm8
- andl %ebp,%esi
- xorl %eax,%ebp
+ addl %esi,%ebx
psrldq $4,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
pxor %xmm2,%xmm6
- rorl $7,%edx
- addl %esi,%ebx
+ addl %ecx,%ebx
+ rorl $7,%ecx
pxor %xmm4,%xmm10
- addl 36(%rsp),%eax
- xorl %ebp,%edx
+ xorl %ebp,%edi
movl %ebx,%esi
- roll $5,%ebx
+ addl 36(%rsp),%eax
pxor %xmm10,%xmm6
- andl %edx,%edi
- xorl %ebp,%edx
+ xorl %edx,%ecx
+ roll $5,%ebx
movdqa %xmm8,16(%rsp)
- xorl %ebp,%edi
- addl %ebx,%eax
- movdqa %xmm6,%xmm9
- movdqa %xmm6,%xmm10
- rorl $7,%ecx
addl %edi,%eax
- addl 40(%rsp),%ebp
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
pslldq $12,%xmm9
paddd %xmm6,%xmm6
movl %eax,%edi
- roll $5,%eax
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl 40(%rsp),%ebp
psrld $31,%xmm10
- xorl %edx,%esi
- addl %eax,%ebp
- movdqa %xmm9,%xmm8
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ roll $5,%eax
addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
por %xmm10,%xmm6
- addl 44(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 44(%rsp),%edx
pslld $2,%xmm8
pxor %xmm9,%xmm6
- andl %ebx,%edi
- xorl %ecx,%ebx
- movdqa 16(%r11),%xmm9
- xorl %ecx,%edi
- addl %ebp,%edx
- pxor %xmm8,%xmm6
- rorl $7,%eax
+ xorl %ebx,%eax
+ movdqa -32(%r11),%xmm9
+ roll $5,%ebp
addl %edi,%edx
- movdqa %xmm4,%xmm7
- addl 48(%rsp),%ecx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
movdqa %xmm6,%xmm8
-.byte 102,15,58,15,251,8
+ paddd %xmm6,%xmm9
movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
roll $5,%edx
- paddd %xmm6,%xmm9
- andl %eax,%esi
- xorl %ebx,%eax
+ addl %esi,%ecx
psrldq $4,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
pxor %xmm3,%xmm7
- rorl $7,%ebp
- addl %esi,%ecx
+ addl %edx,%ecx
+ rorl $7,%edx
pxor %xmm5,%xmm8
- addl 52(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 52(%rsp),%ebx
pxor %xmm8,%xmm7
- andl %ebp,%edi
- xorl %eax,%ebp
+ xorl %ebp,%edx
+ roll $5,%ecx
movdqa %xmm9,32(%rsp)
- xorl %eax,%edi
- addl %ecx,%ebx
- movdqa %xmm7,%xmm10
- movdqa %xmm7,%xmm8
- rorl $7,%edx
addl %edi,%ebx
- addl 56(%rsp),%eax
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
pslldq $12,%xmm10
paddd %xmm7,%xmm7
movl %ebx,%edi
- roll $5,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
+ addl 56(%rsp),%eax
psrld $31,%xmm8
- xorl %ebp,%esi
- addl %ebx,%eax
- movdqa %xmm10,%xmm9
- rorl $7,%ecx
+ xorl %edx,%ecx
+ roll $5,%ebx
addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
por %xmm8,%xmm7
- addl 60(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 60(%rsp),%ebp
pslld $2,%xmm9
pxor %xmm10,%xmm7
- andl %ecx,%edi
- xorl %edx,%ecx
- movdqa 16(%r11),%xmm10
- xorl %edx,%edi
- addl %eax,%ebp
- pxor %xmm9,%xmm7
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ movdqa -32(%r11),%xmm10
+ roll $5,%eax
addl %edi,%ebp
- movdqa %xmm7,%xmm9
- addl 0(%rsp),%edx
- pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,206,8
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
roll $5,%ebp
pxor %xmm1,%xmm0
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl %esi,%edx
+ andl %eax,%edi
movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
paddd %xmm7,%xmm10
- xorl %ecx,%esi
addl %ebp,%edx
pxor %xmm9,%xmm0
- rorl $7,%eax
- addl %esi,%edx
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
addl 4(%rsp),%ecx
- xorl %ebx,%eax
movdqa %xmm0,%xmm9
- movdqa %xmm10,48(%rsp)
- movl %edx,%esi
+ xorl %eax,%ebp
roll $5,%edx
- andl %eax,%edi
- xorl %ebx,%eax
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
pslld $2,%xmm0
- xorl %ebx,%edi
addl %edx,%ecx
+ rorl $7,%edx
psrld $30,%xmm9
- rorl $7,%ebp
- addl %edi,%ecx
- addl 8(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%esi
movl %ecx,%edi
- roll $5,%ecx
+ addl 8(%rsp),%ebx
por %xmm9,%xmm0
- andl %ebp,%esi
- xorl %eax,%ebp
- movdqa %xmm0,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- addl 12(%rsp),%eax
xorl %ebp,%edx
- movl %ebx,%esi
- roll $5,%ebx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
andl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
xorl %ebp,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ movl %ebx,%esi
+ roll $5,%ebx
addl %edi,%eax
- addl 16(%rsp),%ebp
- pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,215,8
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
movl %eax,%edi
roll $5,%eax
pxor %xmm2,%xmm1
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm8,%xmm9
- paddd %xmm0,%xmm8
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
pxor %xmm10,%xmm1
addl 20(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm8,0(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm1
+ addl %ebp,%edx
addl 24(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm1
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm10
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm10,%xmm1
+ addl %edx,%ecx
addl 28(%rsp),%ebx
- xorl %eax,%edi
- movdqa %xmm1,%xmm8
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 32(%rsp),%eax
- pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,192,8
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
movl %ebx,%edi
roll $5,%ebx
pxor %xmm3,%xmm2
- xorl %edx,%esi
- addl %ebx,%eax
- movdqa 32(%r11),%xmm10
- paddd %xmm1,%xmm9
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r11),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
pxor %xmm8,%xmm2
addl 36(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
movdqa %xmm9,16(%rsp)
- xorl %ecx,%edi
- addl %eax,%ebp
rorl $7,%ebx
- addl %edi,%ebp
- pslld $2,%xmm2
+ addl %eax,%ebp
addl 40(%rsp),%edx
- xorl %ecx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm2
+ xorl %ebx,%esi
movl %ebp,%edi
+ psrld $30,%xmm8
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
por %xmm8,%xmm2
+ addl %ebp,%edx
addl 44(%rsp),%ecx
- xorl %ebx,%edi
- movdqa %xmm2,%xmm9
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 48(%rsp),%ebx
- pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,201,8
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
movl %ecx,%edi
roll $5,%ecx
pxor %xmm4,%xmm3
- xorl %ebp,%esi
- addl %ecx,%ebx
+ addl %esi,%ebx
+ xorl %ebp,%edi
movdqa %xmm10,%xmm8
- paddd %xmm2,%xmm10
rorl $7,%edx
- addl %esi,%ebx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
pxor %xmm9,%xmm3
addl 52(%rsp),%eax
- xorl %ebp,%edi
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
movdqa %xmm10,32(%rsp)
- xorl %edx,%edi
- addl %ebx,%eax
rorl $7,%ecx
- addl %edi,%eax
- pslld $2,%xmm3
+ addl %ebx,%eax
addl 56(%rsp),%ebp
- xorl %edx,%esi
- psrld $30,%xmm9
+ pslld $2,%xmm3
+ xorl %ecx,%esi
movl %eax,%edi
+ psrld $30,%xmm9
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
por %xmm9,%xmm3
+ addl %eax,%ebp
addl 60(%rsp),%edx
- xorl %ecx,%edi
- movdqa %xmm3,%xmm10
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 0(%rsp),%ecx
- pxor %xmm0,%xmm4
-.byte 102,68,15,58,15,210,8
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
movl %edx,%edi
roll $5,%edx
pxor %xmm5,%xmm4
- xorl %eax,%esi
- addl %edx,%ecx
+ addl %esi,%ecx
+ xorl %eax,%edi
movdqa %xmm8,%xmm9
- paddd %xmm3,%xmm8
rorl $7,%ebp
- addl %esi,%ecx
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
pxor %xmm10,%xmm4
addl 4(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
movdqa %xmm8,48(%rsp)
- xorl %ebp,%edi
- addl %ecx,%ebx
rorl $7,%edx
- addl %edi,%ebx
- pslld $2,%xmm4
+ addl %ecx,%ebx
addl 8(%rsp),%eax
- xorl %ebp,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm4
+ xorl %edx,%esi
movl %ebx,%edi
+ psrld $30,%xmm10
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
por %xmm10,%xmm4
+ addl %ebx,%eax
addl 12(%rsp),%ebp
- xorl %edx,%edi
- movdqa %xmm4,%xmm8
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 16(%rsp),%edx
- pxor %xmm1,%xmm5
-.byte 102,68,15,58,15,195,8
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
movl %ebp,%edi
roll $5,%ebp
pxor %xmm6,%xmm5
- xorl %ebx,%esi
- addl %ebp,%edx
+ addl %esi,%edx
+ xorl %ebx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm4,%xmm9
rorl $7,%eax
- addl %esi,%edx
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
pxor %xmm8,%xmm5
addl 20(%rsp),%ecx
- xorl %ebx,%edi
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
movdqa %xmm9,0(%rsp)
- xorl %eax,%edi
- addl %edx,%ecx
rorl $7,%ebp
- addl %edi,%ecx
- pslld $2,%xmm5
+ addl %edx,%ecx
addl 24(%rsp),%ebx
- xorl %eax,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm5
+ xorl %ebp,%esi
movl %ecx,%edi
+ psrld $30,%xmm8
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
por %xmm8,%xmm5
+ addl %ecx,%ebx
addl 28(%rsp),%eax
- xorl %ebp,%edi
- movdqa %xmm5,%xmm9
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
movl %ebx,%esi
- roll $5,%ebx
xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ roll $5,%ebx
addl %edi,%eax
- movl %ecx,%edi
- pxor %xmm2,%xmm6
-.byte 102,68,15,58,15,204,8
+ xorl %ecx,%esi
xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
addl 32(%rsp),%ebp
- andl %edx,%edi
- pxor %xmm7,%xmm6
andl %ecx,%esi
+ xorl %edx,%ecx
rorl $7,%ebx
- movdqa %xmm10,%xmm8
- paddd %xmm5,%xmm10
- addl %edi,%ebp
+ punpcklqdq %xmm5,%xmm9
movl %eax,%edi
- pxor %xmm9,%xmm6
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
roll $5,%eax
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movdqa %xmm6,%xmm9
- movdqa %xmm10,16(%rsp)
- movl %ebx,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
addl 36(%rsp),%edx
- andl %ecx,%esi
- pslld $2,%xmm6
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- psrld $30,%xmm9
- addl %esi,%edx
+ movdqa %xmm6,%xmm9
movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- por %xmm9,%xmm6
- movl %eax,%edi
+ xorl %eax,%esi
+ pslld $2,%xmm6
xorl %ebx,%eax
- movdqa %xmm6,%xmm10
+ addl %ebp,%edx
+ psrld $30,%xmm9
addl 40(%rsp),%ecx
- andl %ebx,%edi
andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
rorl $7,%ebp
- addl %edi,%ecx
movl %edx,%edi
+ xorl %eax,%esi
roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movl %ebp,%esi
+ xorl %ebp,%edi
xorl %eax,%ebp
+ addl %edx,%ecx
addl 44(%rsp),%ebx
- andl %eax,%esi
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- addl %esi,%ebx
movl %ecx,%esi
+ xorl %ebp,%edi
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
+ xorl %edx,%esi
+ xorl %ebp,%edx
addl %ecx,%ebx
- movl %edx,%edi
pxor %xmm3,%xmm7
-.byte 102,68,15,58,15,213,8
- xorl %ebp,%edx
addl 48(%rsp),%eax
- andl %ebp,%edi
- pxor %xmm0,%xmm7
andl %edx,%esi
+ xorl %ebp,%edx
rorl $7,%ecx
- movdqa 48(%r11),%xmm9
- paddd %xmm6,%xmm8
- addl %edi,%eax
+ punpcklqdq %xmm6,%xmm10
movl %ebx,%edi
- pxor %xmm10,%xmm7
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movdqa %xmm7,%xmm10
- movdqa %xmm8,32(%rsp)
- movl %ecx,%esi
+ movdqa 32(%r11),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
addl 52(%rsp),%ebp
- andl %edx,%esi
- pslld $2,%xmm7
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- psrld $30,%xmm10
- addl %esi,%ebp
+ movdqa %xmm7,%xmm10
movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- por %xmm10,%xmm7
- movl %ebx,%edi
+ xorl %ebx,%esi
+ pslld $2,%xmm7
xorl %ecx,%ebx
- movdqa %xmm7,%xmm8
+ addl %eax,%ebp
+ psrld $30,%xmm10
addl 56(%rsp),%edx
- andl %ecx,%edi
andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
rorl $7,%eax
- addl %edi,%edx
movl %ebp,%edi
+ xorl %ebx,%esi
roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movl %eax,%esi
+ xorl %eax,%edi
xorl %ebx,%eax
+ addl %ebp,%edx
addl 60(%rsp),%ecx
- andl %ebx,%esi
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- addl %esi,%ecx
movl %edx,%esi
+ xorl %eax,%edi
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
+ xorl %ebp,%esi
+ xorl %eax,%ebp
addl %edx,%ecx
- movl %ebp,%edi
pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,198,8
- xorl %eax,%ebp
addl 0(%rsp),%ebx
- andl %eax,%edi
- pxor %xmm1,%xmm0
andl %ebp,%esi
+ xorl %eax,%ebp
rorl $7,%edx
- movdqa %xmm9,%xmm10
- paddd %xmm7,%xmm9
- addl %edi,%ebx
+ punpcklqdq %xmm7,%xmm8
movl %ecx,%edi
- pxor %xmm8,%xmm0
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
roll $5,%ecx
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movdqa %xmm0,%xmm8
- movdqa %xmm9,48(%rsp)
- movl %edx,%esi
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
addl 4(%rsp),%eax
- andl %ebp,%esi
- pslld $2,%xmm0
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- psrld $30,%xmm8
- addl %esi,%eax
+ movdqa %xmm0,%xmm8
movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- por %xmm8,%xmm0
- movl %ecx,%edi
+ xorl %ecx,%esi
+ pslld $2,%xmm0
xorl %edx,%ecx
- movdqa %xmm0,%xmm9
+ addl %ebx,%eax
+ psrld $30,%xmm8
addl 8(%rsp),%ebp
- andl %edx,%edi
andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
rorl $7,%ebx
- addl %edi,%ebp
movl %eax,%edi
+ xorl %ecx,%esi
roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movl %ebx,%esi
+ xorl %ebx,%edi
xorl %ecx,%ebx
+ addl %eax,%ebp
addl 12(%rsp),%edx
- andl %ecx,%esi
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- addl %esi,%edx
movl %ebp,%esi
+ xorl %ebx,%edi
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
+ xorl %eax,%esi
+ xorl %ebx,%eax
addl %ebp,%edx
- movl %eax,%edi
pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,207,8
- xorl %ebx,%eax
addl 16(%rsp),%ecx
- andl %ebx,%edi
- pxor %xmm2,%xmm1
andl %eax,%esi
+ xorl %ebx,%eax
rorl $7,%ebp
- movdqa %xmm10,%xmm8
- paddd %xmm0,%xmm10
- addl %edi,%ecx
+ punpcklqdq %xmm0,%xmm9
movl %edx,%edi
- pxor %xmm9,%xmm1
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
roll $5,%edx
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movdqa %xmm1,%xmm9
- movdqa %xmm10,0(%rsp)
- movl %ebp,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
addl 20(%rsp),%ebx
- andl %eax,%esi
- pslld $2,%xmm1
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- psrld $30,%xmm9
- addl %esi,%ebx
+ movdqa %xmm1,%xmm9
movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- por %xmm9,%xmm1
- movl %edx,%edi
+ xorl %edx,%esi
+ pslld $2,%xmm1
xorl %ebp,%edx
- movdqa %xmm1,%xmm10
+ addl %ecx,%ebx
+ psrld $30,%xmm9
addl 24(%rsp),%eax
- andl %ebp,%edi
andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
rorl $7,%ecx
- addl %edi,%eax
movl %ebx,%edi
+ xorl %edx,%esi
roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movl %ecx,%esi
+ xorl %ecx,%edi
xorl %edx,%ecx
+ addl %ebx,%eax
addl 28(%rsp),%ebp
- andl %edx,%esi
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- addl %esi,%ebp
movl %eax,%esi
+ xorl %ecx,%edi
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
addl %eax,%ebp
- movl %ebx,%edi
pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,208,8
- xorl %ecx,%ebx
addl 32(%rsp),%edx
- andl %ecx,%edi
- pxor %xmm3,%xmm2
andl %ebx,%esi
+ xorl %ecx,%ebx
rorl $7,%eax
- movdqa %xmm8,%xmm9
- paddd %xmm1,%xmm8
- addl %edi,%edx
+ punpcklqdq %xmm1,%xmm10
movl %ebp,%edi
- pxor %xmm10,%xmm2
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
roll $5,%ebp
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movdqa %xmm2,%xmm10
- movdqa %xmm8,16(%rsp)
- movl %eax,%esi
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
addl 36(%rsp),%ecx
- andl %ebx,%esi
- pslld $2,%xmm2
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- psrld $30,%xmm10
- addl %esi,%ecx
+ movdqa %xmm2,%xmm10
movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- por %xmm10,%xmm2
- movl %ebp,%edi
+ xorl %ebp,%esi
+ pslld $2,%xmm2
xorl %eax,%ebp
- movdqa %xmm2,%xmm8
+ addl %edx,%ecx
+ psrld $30,%xmm10
addl 40(%rsp),%ebx
- andl %eax,%edi
andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
rorl $7,%edx
- addl %edi,%ebx
movl %ecx,%edi
+ xorl %ebp,%esi
roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movl %edx,%esi
+ xorl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
addl 44(%rsp),%eax
- andl %ebp,%esi
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- addl %esi,%eax
movl %ebx,%esi
+ xorl %edx,%edi
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
+ xorl %edx,%esi
addl %ebx,%eax
- addl 48(%rsp),%ebp
pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,193,8
- xorl %edx,%esi
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
movl %eax,%edi
roll $5,%eax
pxor %xmm4,%xmm3
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm2,%xmm9
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
pxor %xmm8,%xmm3
addl 52(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm9,32(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm3
+ addl %ebp,%edx
addl 56(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm3
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm8
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm8,%xmm3
+ addl %edx,%ecx
addl 60(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 0(%rsp),%eax
- paddd %xmm3,%xmm10
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
movdqa %xmm10,48(%rsp)
- addl %ebx,%eax
rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
addl 4(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 8(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 12(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
cmpq %r10,%r9
je .Ldone_ssse3
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2278,113 +2205,112 @@ _ssse3_shortcut:
.byte 102,15,56,0,198
addq $64,%r9
addl 16(%rsp),%ebx
- xorl %eax,%esi
-.byte 102,15,56,0,206
+ xorl %ebp,%esi
movl %ecx,%edi
+.byte 102,15,56,0,206
roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
paddd %xmm9,%xmm0
- xorl %ebp,%esi
addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- movdqa %xmm0,0(%rsp)
addl 20(%rsp),%eax
- xorl %ebp,%edi
- psubd %xmm9,%xmm0
+ xorl %edx,%edi
movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
-.byte 102,15,56,0,214
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
+.byte 102,15,56,0,214
roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
paddd %xmm9,%xmm1
- xorl %eax,%esi
addl %edx,%ecx
- rorl $7,%ebp
- addl %esi,%ecx
- movdqa %xmm1,16(%rsp)
addl 36(%rsp),%ebx
- xorl %eax,%edi
- psubd %xmm9,%xmm1
+ xorl %ebp,%edi
movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
-.byte 102,15,56,0,222
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
+.byte 102,15,56,0,222
roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
paddd %xmm9,%xmm2
- xorl %ebx,%esi
addl %ebp,%edx
- rorl $7,%eax
- addl %esi,%edx
- movdqa %xmm2,32(%rsp)
addl 52(%rsp),%ecx
- xorl %ebx,%edi
- psubd %xmm9,%xmm2
+ xorl %eax,%edi
movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2394,108 +2320,110 @@ _ssse3_shortcut:
movl %esi,4(%r8)
movl %esi,%ebx
movl %ecx,8(%r8)
+ movl %ecx,%edi
movl %edx,12(%r8)
+ xorl %edx,%edi
movl %ebp,16(%r8)
+ andl %edi,%esi
jmp .Loop_ssse3
.align 16
.Ldone_ssse3:
addl 16(%rsp),%ebx
- xorl %eax,%esi
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 20(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
- addl 36(%rsp),%ebx
xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 52(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2506,21 +2434,28 @@ _ssse3_shortcut:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq 64(%rsp),%rsi
- movq 0(%rsi),%r12
- movq 8(%rsi),%rbp
- movq 16(%rsi),%rbx
- leaq 24(%rsi),%rsp
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
.align 64
K_XX_XX:
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86.s b/lib/accelerated/x86/elf/sha256-ssse3-x86.s
index aab48fa387..211468c32a 100644
--- a/lib/accelerated/x86/elf/sha256-ssse3-x86.s
+++ b/lib/accelerated/x86/elf/sha256-ssse3-x86.s
@@ -64,195 +64,391 @@ sha256_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ jmp .L002loop
.align 16
.L002loop:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
- movl 12(%edi),%edx
bswap %eax
+ movl 12(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 16(%edi),%eax
movl 20(%edi),%ebx
movl 24(%edi),%ecx
- movl 28(%edi),%edx
bswap %eax
+ movl 28(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 32(%edi),%eax
movl 36(%edi),%ebx
movl 40(%edi),%ecx
- movl 44(%edi),%edx
bswap %eax
+ movl 44(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 48(%edi),%eax
movl 52(%edi),%ebx
movl 56(%edi),%ecx
- movl 60(%edi),%edx
bswap %eax
+ movl 60(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
addl $64,%edi
- subl $32,%esp
- movl %edi,100(%esp)
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edi
- movl %ebx,4(%esp)
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
movl 16(%esi),%edx
movl 20(%esi),%ebx
movl 24(%esi),%ecx
movl 28(%esi),%edi
- movl %ebx,20(%esp)
- movl %ecx,24(%esp)
- movl %edi,28(%esp)
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
.align 16
.L00300_15:
- movl 92(%esp),%ebx
movl %edx,%ecx
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $5,%ecx
- xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
- movl %eax,(%esp)
- movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
- movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %edi,%eax
addl $4,%ebp
addl %ebx,%eax
- addl %esi,%edx
- addl %esi,%eax
cmpl $3248222580,%esi
jne .L00300_15
- movl 152(%esp),%ebx
+ movl 156(%esp),%ecx
+ jmp .L00416_63
.align 16
.L00416_63:
- movl %ebx,%esi
- movl 100(%esp),%ecx
- rorl $11,%esi
- movl %ecx,%edi
- xorl %ebx,%esi
- rorl $7,%esi
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
shrl $3,%ebx
- rorl $2,%edi
- xorl %esi,%ebx
- xorl %ecx,%edi
- rorl $17,%edi
- shrl $10,%ecx
- addl 156(%esp),%ebx
- xorl %ecx,%edi
- addl 120(%esp),%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
movl %edx,%ecx
- addl %edi,%ebx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
- xorl %edx,%ecx
- rorl $5,%ecx
- movl %ebx,92(%esp)
+ addl %edi,%ebx
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne .L00416_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005loop_shrd:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 16
+.L00600_15_shrd:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne .L00600_15_shrd
+ movl 156(%esp),%ecx
+ jmp .L00716_63_shrd
+.align 16
+.L00716_63_shrd:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ shrdl $11,%ecx,%ecx
+ movl %esi,%edi
+ shrdl $2,%esi,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ shrdl $17,%esi,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
addl $4,%ebp
addl %ebx,%eax
- movl 152(%esp),%ebx
- addl %esi,%edx
- addl %esi,%eax
cmpl $3329325298,%esi
- jne .L00416_63
- movl 352(%esp),%esi
- movl 4(%esp),%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edi
+ jne .L00716_63_shrd
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
addl (%esi),%eax
addl 4(%esi),%ebx
- addl 8(%esi),%ecx
- addl 12(%esi),%edi
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
movl %eax,(%esi)
movl %ebx,4(%esi)
- movl %ecx,8(%esi)
- movl %edi,12(%esi)
- movl 20(%esp),%eax
- movl 24(%esp),%ebx
- movl 28(%esp),%ecx
- movl 356(%esp),%edi
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
addl 16(%esi),%edx
addl 20(%esi),%eax
addl 24(%esi),%ebx
@@ -261,10 +457,10 @@ sha256_block_data_order:
movl %eax,20(%esi)
movl %ebx,24(%esi)
movl %ecx,28(%esi)
- addl $352,%esp
+ leal 356(%esp),%esp
subl $256,%ebp
cmpl 8(%esp),%edi
- jb .L002loop
+ jb .L005loop_shrd
movl 12(%esp),%esp
popl %edi
popl %esi
@@ -273,27 +469,2920 @@ sha256_block_data_order:
ret
.align 64
.L001K256:
-.long 1116352408,1899447441,3049323471,3921009573
-.long 961987163,1508970993,2453635748,2870763221
-.long 3624381080,310598401,607225278,1426881987
-.long 1925078388,2162078206,2614888103,3248222580
-.long 3835390401,4022224774,264347078,604807628
-.long 770255983,1249150122,1555081692,1996064986
-.long 2554220882,2821834349,2952996808,3210313671
-.long 3336571891,3584528711,113926993,338241895
-.long 666307205,773529912,1294757372,1396182291
-.long 1695183700,1986661051,2177026350,2456956037
-.long 2730485921,2820302411,3259730800,3345764771
-.long 3516065817,3600352804,4094571909,275423344
-.long 430227734,506948616,659060556,883997877
-.long 958139571,1322822218,1537002063,1747873779
-.long 1955562222,2024104815,2227730452,2361852424
-.long 2428436474,2756734187,3204031479,3329325298
-.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.align 16
+.L008unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp .L009grand_loop
+.align 16
+.L009grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb .L009grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.section .note.GNU-stack,"",%progbits
diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86.s b/lib/accelerated/x86/elf/sha512-ssse3-x86.s
index e8eeefe7ad..7fa849a267 100644
--- a/lib/accelerated/x86/elf/sha512-ssse3-x86.s
+++ b/lib/accelerated/x86/elf/sha512-ssse3-x86.s
@@ -594,6 +594,8 @@ sha512_block_data_order:
.long 4234509866,1501505948
.long 987167468,1607167915
.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s
index 22f55fe953..f85e0bb6d8 100644
--- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s
+++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s
@@ -39,10 +39,17 @@
#
.text
+
.globl sha256_block_data_order
.type sha256_block_data_order,@function
.align 16
sha256_block_data_order:
+ leaq _gnutls_x86_cpuid_s(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $512,%r10d
+ jnz .Lssse3_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -60,8 +67,6 @@ sha256_block_data_order:
movq %r11,64+24(%rsp)
.Lprologue:
- leaq K256(%rip),%rbp
-
movl 0(%rdi),%eax
movl 4(%rdi),%ebx
movl 8(%rdi),%ecx
@@ -74,1694 +79,1632 @@ sha256_block_data_order:
.align 16
.Lloop:
- xorq %rdi,%rdi
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
movl 0(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 4(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 8(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 12(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 16(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 20(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 24(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 28(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
movl 32(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 36(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 40(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 44(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 48(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 52(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 56(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 60(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
jmp .Lrounds_16_xx
.align 16
.Lrounds_16_xx:
movl 4(%rsp),%r13d
- movl 56(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 56(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 36(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
addl 0(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 8(%rsp),%r13d
- movl 60(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 60(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 40(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
addl 4(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 12(%rsp),%r13d
- movl 0(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 0(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 44(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
addl 8(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 16(%rsp),%r13d
- movl 4(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 4(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 48(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
addl 12(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 20(%rsp),%r13d
- movl 8(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 8(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 52(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
addl 16(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 24(%rsp),%r13d
- movl 12(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 12(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 56(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
addl 20(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 28(%rsp),%r13d
- movl 16(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 16(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 60(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
addl 24(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 32(%rsp),%r13d
- movl 20(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 20(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 0(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
addl 28(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
movl 36(%rsp),%r13d
- movl 24(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 24(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 4(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
addl 32(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 40(%rsp),%r13d
- movl 28(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 28(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 8(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
addl 36(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 44(%rsp),%r13d
- movl 32(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 32(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 12(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
addl 40(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 48(%rsp),%r13d
- movl 36(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 36(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 16(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
addl 44(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 52(%rsp),%r13d
- movl 40(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 40(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 20(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
addl 48(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 56(%rsp),%r13d
- movl 44(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 44(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 24(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
addl 52(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 60(%rsp),%r13d
- movl 48(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 48(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 28(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
addl 56(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 0(%rsp),%r13d
- movl 52(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 52(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 32(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
addl 60(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
- cmpq $64,%rdi
- jb .Lrounds_16_xx
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
leaq 64(%rsi),%rsi
addl 0(%rdi),%eax
@@ -1800,20 +1743,1139 @@ sha256_block_data_order:
.type K256,@object
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha256_block_data_order_ssse3,@function
+.align 64
+sha256_block_data_order_ssse3:
+.Lssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+.byte 102,15,56,0,199
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 32(%rbp),%xmm5
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
.section .note.GNU-stack,"",%progbits
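
For orientation only: the scalar rounds above and the new sha256_block_data_order_ssse3 path both implement standard FIPS 180-4 SHA-256 (the K256 constants are the usual round-constant table). Below is a minimal C sketch of that round function and of the message-schedule recurrence the SSSE3 loop vectorises four words at a time; the helper names (rotr, s0, s1, S0, S1, sha256_round) are illustrative, not symbols from the generated file.

/* Minimal reference sketch of one SHA-256 round and the schedule recurrence. */
#include <stdint.h>

static uint32_t rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

static uint32_t s0(uint32_t x) { return rotr(x,  7) ^ rotr(x, 18) ^ (x >> 3);  } /* schedule sigma0 */
static uint32_t s1(uint32_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10); } /* schedule sigma1 */
static uint32_t S0(uint32_t x) { return rotr(x,  2) ^ rotr(x, 13) ^ rotr(x, 22); } /* round Sigma0 */
static uint32_t S1(uint32_t x) { return rotr(x,  6) ^ rotr(x, 11) ^ rotr(x, 25); } /* round Sigma1 */

/* One round: K is the current K256 constant, W the expanded message word. */
static void sha256_round(uint32_t st[8], uint32_t K, uint32_t W)
{
    uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
    uint32_t e = st[4], f = st[5], g = st[6], h = st[7];
    uint32_t t1 = h + S1(e) + ((e & f) ^ (~e & g)) + K + W;  /* Ch(e,f,g) */
    uint32_t t2 = S0(a) + ((a & b) ^ (a & c) ^ (b & c));     /* Maj(a,b,c) */
    st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
    st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
}

/* Schedule expansion, for t = 16..63:
 *   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
 * which is the work the psrld/pslld/pshufd sequences above perform on four
 * W words per iteration. */
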
diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86.s b/lib/accelerated/x86/macosx/aes-ssse3-x86.s
index 33f33fd4f3..3c12cbf20e 100644
--- a/lib/accelerated/x86/macosx/aes-ssse3-x86.s
+++ b/lib/accelerated/x86/macosx/aes-ssse3-x86.s
@@ -82,33 +82,33 @@ __vpaes_encrypt_core:
movdqa %xmm6,%xmm1
movdqa (%ebp),%xmm2
pandn %xmm0,%xmm1
- movdqu (%edx),%xmm5
- psrld $4,%xmm1
pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
.byte 102,15,56,0,208
movdqa 16(%ebp),%xmm0
-.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
+ psrld $4,%xmm1
addl $16,%edx
+.byte 102,15,56,0,193
leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
jmp L000enc_entry
.align 4,0x90
L001enc_loop:
movdqa 32(%ebp),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa 64(%ebp),%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
movdqa 80(%ebp),%xmm2
-.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addl $16,%edx
pxor %xmm2,%xmm0
@@ -117,28 +117,28 @@ L001enc_loop:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andl $48,%ecx
- pxor %xmm3,%xmm0
subl $1,%eax
+ pxor %xmm3,%xmm0
L000enc_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
- movdqu (%edx),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
pxor %xmm1,%xmm3
jnz L001enc_loop
movdqa 96(%ebp),%xmm4
@@ -152,8 +152,8 @@ L000enc_entry:
ret
.align 4
__vpaes_decrypt_core:
- movl 240(%edx),%eax
leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
movdqa %xmm6,%xmm1
movdqa -64(%ebx),%xmm2
pandn %xmm0,%xmm1
@@ -176,56 +176,56 @@ __vpaes_decrypt_core:
.align 4,0x90
L003dec_loop:
movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addl $16,%edx
-.byte 102,15,56,0,197
movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subl $1,%eax
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%ebx),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%ebx),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ addl $16,%edx
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
L002dec_entry:
movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
pandn %xmm0,%xmm1
- psrld $4,%xmm1
pand %xmm6,%xmm0
- movdqa -32(%ebp),%xmm2
+ psrld $4,%xmm1
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm7,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
jnz L003dec_loop
movdqa 96(%ebx),%xmm4
.byte 102,15,56,0,226
@@ -330,12 +330,12 @@ L013schedule_mangle_last_dec:
ret
.align 4
__vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
ret
.align 4
@@ -588,6 +588,8 @@ L_vpaes_cbc_encrypt_begin:
movl 24(%esp),%edi
movl 28(%esp),%eax
movl 32(%esp),%edx
+ subl $16,%eax
+ jc L020cbc_abort
leal -56(%esp),%ebx
movl 36(%esp),%ebp
andl $-16,%ebx
@@ -597,18 +599,17 @@ L_vpaes_cbc_encrypt_begin:
subl %esi,%edi
movl %ebx,48(%esp)
movl %edi,(%esp)
- subl $16,%eax
movl %edx,4(%esp)
movl %ebp,8(%esp)
movl %eax,%edi
- leal L_vpaes_consts+0x30-L020pic_point,%ebp
+ leal L_vpaes_consts+0x30-L021pic_point,%ebp
call __vpaes_preheat
-L020pic_point:
+L021pic_point:
cmpl $0,%ecx
- je L021cbc_dec_loop
- jmp L022cbc_enc_loop
+ je L022cbc_dec_loop
+ jmp L023cbc_enc_loop
.align 4,0x90
-L022cbc_enc_loop:
+L023cbc_enc_loop:
movdqu (%esi),%xmm0
pxor %xmm1,%xmm0
call __vpaes_encrypt_core
@@ -618,10 +619,10 @@ L022cbc_enc_loop:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc L022cbc_enc_loop
- jmp L023cbc_done
+ jnc L023cbc_enc_loop
+ jmp L024cbc_done
.align 4,0x90
-L021cbc_dec_loop:
+L022cbc_dec_loop:
movdqu (%esi),%xmm0
movdqa %xmm1,16(%esp)
movdqa %xmm0,32(%esp)
@@ -633,11 +634,12 @@ L021cbc_dec_loop:
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
- jnc L021cbc_dec_loop
-L023cbc_done:
+ jnc L022cbc_dec_loop
+L024cbc_done:
movl 8(%esp),%ebx
movl 48(%esp),%esp
movdqu %xmm1,(%ebx)
+L020cbc_abort:
popl %edi
popl %esi
popl %ebx
diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
index 1fff24b967..ecb0b770a4 100644
--- a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
@@ -43,8 +43,8 @@ _vpaes_encrypt_core:
movdqa L$k_ipt+16(%rip),%xmm0
.byte 102,15,56,0,193
pxor %xmm5,%xmm2
- pxor %xmm2,%xmm0
addq $16,%r9
+ pxor %xmm2,%xmm0
leaq L$k_mc_backward(%rip),%r10
jmp L$enc_entry
@@ -52,19 +52,19 @@ _vpaes_encrypt_core:
L$enc_loop:
movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
+ pxor %xmm5,%xmm4
movdqa %xmm15,%xmm5
-.byte 102,15,56,0,234
+ pxor %xmm4,%xmm0
movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
movdqa %xmm14,%xmm2
.byte 102,15,56,0,211
- pxor %xmm5,%xmm2
- movdqa (%r11,%r10,1),%xmm4
movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
.byte 102,15,56,0,193
addq $16,%r9
pxor %xmm2,%xmm0
@@ -73,30 +73,30 @@ L$enc_loop:
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andq $48,%r11
- pxor %xmm3,%xmm0
subq $1,%rax
+ pxor %xmm3,%xmm0
L$enc_entry:
movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm5
.byte 102,15,56,0,232
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm5,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
.byte 102,15,56,0,224
- pxor %xmm5,%xmm4
movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
- movdqu (%r9),%xmm5
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
pxor %xmm1,%xmm3
jnz L$enc_loop
@@ -149,62 +149,61 @@ L$dec_loop:
movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa -16(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
- addq $16,%r9
-
-.byte 102,15,56,0,197
movdqa 0(%r10),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 16(%r10),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- subq $1,%rax
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
-.byte 102,15,56,0,197
- movdqa 32(%r10),%xmm4
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 48(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+.byte 102,15,56,0,226
.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 80(%r10),%xmm0
-.byte 102,15,56,0,195
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
pxor %xmm4,%xmm0
-
+ addq $16,%r9
.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
L$dec_entry:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
psrld $4,%xmm1
pand %xmm9,%xmm0
- movdqa %xmm11,%xmm2
.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
jnz L$dec_loop
@@ -212,7 +211,7 @@ L$dec_entry:
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
- movdqa L$k_sr-L$k_dsbd(%r11),%xmm2
+ movdqa -352(%r11),%xmm2
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,194
@@ -232,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa L$k_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -278,7 +277,7 @@ L$oop_schedule_128:
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp L$oop_schedule_128
@@ -299,7 +298,7 @@ L$oop_schedule_128:
.p2align 4
L$schedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -308,13 +307,13 @@ L$schedule_192:
L$oop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp L$oop_schedule_192
@@ -331,18 +330,18 @@ L$oop_schedule_192:
.p2align 4
L$schedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
L$oop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
pshufd $255,%xmm0,%xmm0
@@ -380,7 +379,7 @@ L$schedule_mangle_last:
L$schedule_mangle_last_dec:
addq $-16,%rdx
pxor L$k_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -412,12 +411,12 @@ L$schedule_mangle_last_dec:
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm0
- pxor %xmm0,%xmm6
+ pshufd $128,%xmm6,%xmm1
pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
- pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
.byte 0xf3,0xc3
@@ -680,9 +679,10 @@ _vpaes_decrypt:
.p2align 4
_vpaes_cbc_encrypt:
xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc L$cbc_abort
movdqu (%r8),%xmm6
subq %rdi,%rsi
- subq $16,%rcx
call _vpaes_preheat
cmpl $0,%r9d
je L$cbc_dec_loop
@@ -711,6 +711,7 @@ L$cbc_dec_loop:
jnc L$cbc_dec_loop
L$cbc_done:
movdqu %xmm6,(%r8)
+L$cbc_abort:
.byte 0xf3,0xc3
@@ -833,7 +834,7 @@ L$k_dsbe:
L$k_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
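
The vpaes CBC changes in the two files above move the length check ahead of any state setup and branch to a new cbc_abort label when fewer than 16 bytes are supplied, so the IV is left untouched on that path. A minimal C sketch of that driver shape, assuming a generic 16-byte block cipher; block_fn, block_encrypt and cbc_encrypt are hypothetical stand-ins, not the vpaes entry points.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*block_fn)(uint8_t out[16], const uint8_t in[16], const void *key);

static void cbc_encrypt(uint8_t *out, const uint8_t *in, size_t len,
                        const void *key, uint8_t iv[16], block_fn block_encrypt)
{
    if (len < 16)                 /* mirrors "subl $16,%eax; jc ...cbc_abort" */
        return;                   /* abort path: chaining value is not written back */
    while (len >= 16) {
        for (int i = 0; i < 16; i++)
            iv[i] ^= in[i];       /* chain: plaintext XOR previous ciphertext (or IV) */
        block_encrypt(iv, iv, key);
        memcpy(out, iv, 16);      /* ciphertext becomes the next chaining value */
        in += 16; out += 16; len -= 16;
    }
}
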
diff --git a/lib/accelerated/x86/macosx/aesni-x86.s b/lib/accelerated/x86/macosx/aesni-x86.s
index 09ca1cbc5c..ef1a0c25a2 100644
--- a/lib/accelerated/x86/macosx/aesni-x86.s
+++ b/lib/accelerated/x86/macosx/aesni-x86.s
@@ -84,27 +84,78 @@ L001dec1_loop_2:
movups %xmm2,(%eax)
ret
.align 4
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.align 4
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.align 4
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -115,25 +166,26 @@ L002enc3_loop:
.align 4
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -145,27 +197,29 @@ L003dec3_loop:
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -179,27 +233,29 @@ L004enc4_loop:
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -212,45 +268,44 @@ L005dec4_loop:
.align 4
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp L_aesni_encrypt6_enter
.align 4,0x90
-L006enc6_loop:
+L008enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 4,0x90
L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L008enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -267,45 +322,44 @@ L_aesni_encrypt6_enter:
.align 4
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ addl $16,%ecx
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%edx,%ecx,1),%xmm0
jmp L_aesni_decrypt6_enter
.align 4,0x90
-L007dec6_loop:
+L009dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 4,0x90
L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L009dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -333,14 +387,14 @@ L_aesni_ecb_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L008ecb_ret
+ jz L010ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L009ecb_decrypt
+ jz L011ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L010ecb_enc_tail
+ jb L012ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -349,9 +403,9 @@ L_aesni_ecb_encrypt_begin:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L011ecb_enc_loop6_enter
+ jmp L013ecb_enc_loop6_enter
.align 4,0x90
-L012ecb_enc_loop6:
+L014ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -366,12 +420,12 @@ L012ecb_enc_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L011ecb_enc_loop6_enter:
+L013ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L012ecb_enc_loop6
+ jnc L014ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -380,18 +434,18 @@ L011ecb_enc_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L010ecb_enc_tail:
+ jz L010ecb_ret
+L012ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L013ecb_enc_one
+ jb L015ecb_enc_one
movups 16(%esi),%xmm3
- je L014ecb_enc_two
+ je L016ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L015ecb_enc_three
+ jb L017ecb_enc_three
movups 48(%esi),%xmm5
- je L016ecb_enc_four
+ je L018ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -400,50 +454,49 @@ L010ecb_enc_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L013ecb_enc_one:
+L015ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L017enc1_loop_3:
+L019enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L017enc1_loop_3
+ jnz L019enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+L016ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L015ecb_enc_three:
+L017ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L016ecb_enc_four:
+L018ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L009ecb_decrypt:
+L011ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L018ecb_dec_tail
+ jb L020ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -452,9 +505,9 @@ L009ecb_decrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L019ecb_dec_loop6_enter
+ jmp L021ecb_dec_loop6_enter
.align 4,0x90
-L020ecb_dec_loop6:
+L022ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -469,12 +522,12 @@ L020ecb_dec_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L019ecb_dec_loop6_enter:
+L021ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L020ecb_dec_loop6
+ jnc L022ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -483,18 +536,18 @@ L019ecb_dec_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L018ecb_dec_tail:
+ jz L010ecb_ret
+L020ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L021ecb_dec_one
+ jb L023ecb_dec_one
movups 16(%esi),%xmm3
- je L022ecb_dec_two
+ je L024ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L023ecb_dec_three
+ jb L025ecb_dec_three
movups 48(%esi),%xmm5
- je L024ecb_dec_four
+ je L026ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -503,44 +556,43 @@ L018ecb_dec_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L021ecb_dec_one:
+L023ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L025dec1_loop_4:
+L027dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L025dec1_loop_4
+ jnz L027dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L024ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L023ecb_dec_three:
+L025ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L010ecb_ret
.align 4,0x90
-L024ecb_dec_four:
+L026ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L008ecb_ret:
+L010ecb_ret:
popl %edi
popl %esi
popl %ebx
@@ -577,45 +629,45 @@ L_aesni_ccm64_encrypt_blocks_begin:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-L026ccm64_enc_outer:
+L028ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+L029ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L029ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz L028ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
@@ -664,67 +716,70 @@ L_aesni_ccm64_decrypt_blocks_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L028enc1_loop_5:
+L030enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L028enc1_loop_5
+ jnz L030enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp L031ccm64_dec_outer
.align 4,0x90
-L029ccm64_dec_outer:
+L031ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L030ccm64_dec_break
+ jz L032ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+L033ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L033ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp L031ccm64_dec_outer
.align 4,0x90
-L030ccm64_dec_break:
+L032ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L032enc1_loop_6:
+L034enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L032enc1_loop_6
+ jnz L034enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
@@ -752,7 +807,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L033ctr32_one_shortcut
+ je L035ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -768,63 +823,59 @@ L_aesni_ctr32_encrypt_blocks_begin:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L034ctr32_tail
+ jb L036ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L035ctr32_loop6
+ jmp L037ctr32_loop6
.align 4,0x90
-L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+L037ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -835,51 +886,51 @@ L035ctr32_loop6:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L035ctr32_loop6
+ jnc L037ctr32_loop6
addl $6,%eax
- jz L036ctr32_ret
+ jz L038ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+L036ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb L039ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je L040ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb L041ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L040ctr32_four
+ je L042ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -897,39 +948,39 @@ L034ctr32_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L036ctr32_ret
+ jmp L038ctr32_ret
.align 4,0x90
-L033ctr32_one_shortcut:
+L035ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L037ctr32_one:
+L039ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L041enc1_loop_7:
+L043enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L041enc1_loop_7
+ jnz L043enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L036ctr32_ret
+ jmp L038ctr32_ret
.align 4,0x90
-L038ctr32_two:
- call __aesni_encrypt3
+L040ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L036ctr32_ret
+ jmp L038ctr32_ret
.align 4,0x90
-L039ctr32_three:
+L041ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -940,9 +991,9 @@ L039ctr32_three:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L036ctr32_ret
+ jmp L038ctr32_ret
.align 4,0x90
-L040ctr32_four:
+L042ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -956,7 +1007,7 @@ L040ctr32_four:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L036ctr32_ret:
+L038ctr32_ret:
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -979,12 +1030,12 @@ L_aesni_xts_encrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L042enc1_loop_8:
+L044enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L042enc1_loop_8
+ jnz L044enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1008,12 +1059,14 @@ L042enc1_loop_8:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L044xts_enc_loop6
+ jc L045xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L046xts_enc_loop6
.align 4,0x90
-L044xts_enc_loop6:
+L046xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1049,6 +1102,7 @@ L044xts_enc_loop6:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1064,19 +1118,17 @@ L044xts_enc_loop6:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1101,26 +1153,25 @@ L044xts_enc_loop6:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L046xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L043xts_enc_short:
+L045xts_enc_short:
addl $96,%eax
- jz L045xts_enc_done6x
+ jz L047xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L046xts_enc_one
+ jb L048xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L047xts_enc_two
+ je L049xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1129,7 +1180,7 @@ L043xts_enc_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L048xts_enc_three
+ jb L050xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1139,7 +1190,7 @@ L043xts_enc_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L049xts_enc_four
+ je L051xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1171,9 +1222,9 @@ L043xts_enc_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L050xts_enc_done
+ jmp L052xts_enc_done
.align 4,0x90
-L046xts_enc_one:
+L048xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1181,37 +1232,36 @@ L046xts_enc_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L051enc1_loop_9:
+L053enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L051enc1_loop_9
+ jnz L053enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L050xts_enc_done
+ jmp L052xts_enc_done
.align 4,0x90
-L047xts_enc_two:
+L049xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L052xts_enc_done
.align 4,0x90
-L048xts_enc_three:
+L050xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1229,9 +1279,9 @@ L048xts_enc_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L050xts_enc_done
+ jmp L052xts_enc_done
.align 4,0x90
-L049xts_enc_four:
+L051xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1253,28 +1303,28 @@ L049xts_enc_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L052xts_enc_done
.align 4,0x90
-L045xts_enc_done6x:
+L047xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L052xts_enc_ret
+ jz L054xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L053xts_enc_steal
+ jmp L055xts_enc_steal
.align 4,0x90
-L050xts_enc_done:
+L052xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L052xts_enc_ret
+ jz L054xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L053xts_enc_steal:
+L055xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1282,7 +1332,7 @@ L053xts_enc_steal:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L053xts_enc_steal
+ jnz L055xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1292,16 +1342,16 @@ L053xts_enc_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L054enc1_loop_10:
+L056enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L054enc1_loop_10
+ jnz L056enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L052xts_enc_ret:
+L054xts_enc_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1324,12 +1374,12 @@ L_aesni_xts_decrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L055enc1_loop_11:
+L057enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L055enc1_loop_11
+ jnz L057enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1358,12 +1408,14 @@ L055enc1_loop_11:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L057xts_dec_loop6
+ jc L058xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L059xts_dec_loop6
.align 4,0x90
-L057xts_dec_loop6:
+L059xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1399,6 +1451,7 @@ L057xts_dec_loop6:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1414,19 +1467,17 @@ L057xts_dec_loop6:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1451,26 +1502,25 @@ L057xts_dec_loop6:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L059xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L056xts_dec_short:
+L058xts_dec_short:
addl $96,%eax
- jz L058xts_dec_done6x
+ jz L060xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L059xts_dec_one
+ jb L061xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L060xts_dec_two
+ je L062xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1479,7 +1529,7 @@ L056xts_dec_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L061xts_dec_three
+ jb L063xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1489,7 +1539,7 @@ L056xts_dec_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L062xts_dec_four
+ je L064xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1521,9 +1571,9 @@ L056xts_dec_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L063xts_dec_done
+ jmp L065xts_dec_done
.align 4,0x90
-L059xts_dec_one:
+L061xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1531,36 +1581,36 @@ L059xts_dec_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L064dec1_loop_12:
+L066dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L064dec1_loop_12
+ jnz L066dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L063xts_dec_done
+ jmp L065xts_dec_done
.align 4,0x90
-L060xts_dec_two:
+L062xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L065xts_dec_done
.align 4,0x90
-L061xts_dec_three:
+L063xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1578,9 +1628,9 @@ L061xts_dec_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L063xts_dec_done
+ jmp L065xts_dec_done
.align 4,0x90
-L062xts_dec_four:
+L064xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1602,20 +1652,20 @@ L062xts_dec_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L065xts_dec_done
.align 4,0x90
-L058xts_dec_done6x:
+L060xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L065xts_dec_ret
+ jz L067xts_dec_ret
movl %eax,112(%esp)
- jmp L066xts_dec_only_one_more
+ jmp L068xts_dec_only_one_more
.align 4,0x90
-L063xts_dec_done:
+L065xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L065xts_dec_ret
+ jz L067xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1625,7 +1675,7 @@ L063xts_dec_done:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L066xts_dec_only_one_more:
+L068xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1639,16 +1689,16 @@ L066xts_dec_only_one_more:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L067dec1_loop_13:
+L069dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L067dec1_loop_13
+ jnz L069dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L068xts_dec_steal:
+L070xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1656,7 +1706,7 @@ L068xts_dec_steal:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L068xts_dec_steal
+ jnz L070xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1666,16 +1716,16 @@ L068xts_dec_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_14:
+L071dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_14
+ jnz L071dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L065xts_dec_ret:
+L067xts_dec_ret:
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1699,7 +1749,7 @@ L_aesni_cbc_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L070cbc_abort
+ jz L072cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1707,14 +1757,14 @@ L_aesni_cbc_encrypt_begin:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L071cbc_decrypt
+ je L073cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L072cbc_enc_tail
+ jb L074cbc_enc_tail
subl $16,%eax
- jmp L073cbc_enc_loop
+ jmp L075cbc_enc_loop
.align 4,0x90
-L073cbc_enc_loop:
+L075cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1722,24 +1772,24 @@ L073cbc_enc_loop:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L074enc1_loop_15:
+L076enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L074enc1_loop_15
+ jnz L076enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L073cbc_enc_loop
+ jnc L075cbc_enc_loop
addl $16,%eax
- jnz L072cbc_enc_tail
+ jnz L074cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L075cbc_ret
-L072cbc_enc_tail:
+ jmp L077cbc_ret
+L074cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1750,20 +1800,20 @@ L072cbc_enc_tail:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L073cbc_enc_loop
+ jmp L075cbc_enc_loop
.align 4,0x90
-L071cbc_decrypt:
+L073cbc_decrypt:
cmpl $80,%eax
- jbe L076cbc_dec_tail
+ jbe L078cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L077cbc_dec_loop6_enter
+ jmp L079cbc_dec_loop6_enter
.align 4,0x90
-L078cbc_dec_loop6:
+L080cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L077cbc_dec_loop6_enter:
+L079cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1793,28 +1843,28 @@ L077cbc_dec_loop6_enter:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L078cbc_dec_loop6
+ ja L080cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L079cbc_dec_tail_collected
+ jle L081cbc_dec_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L076cbc_dec_tail:
+L078cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L080cbc_dec_one
+ jbe L082cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L081cbc_dec_two
+ jbe L083cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L082cbc_dec_three
+ jbe L084cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L083cbc_dec_four
+ jbe L085cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1837,28 +1887,27 @@ L076cbc_dec_tail:
leal 64(%edi),%edi
movaps %xmm6,%xmm2
subl $80,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L081cbc_dec_tail_collected
.align 4,0x90
-L080cbc_dec_one:
+L082cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L084dec1_loop_16:
+L086dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L084dec1_loop_16
+ jnz L086dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L081cbc_dec_tail_collected
.align 4,0x90
-L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L083cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
@@ -1866,9 +1915,9 @@ L081cbc_dec_two:
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L081cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_three:
+L084cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
@@ -1879,9 +1928,9 @@ L082cbc_dec_three:
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L081cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_four:
+L085cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1896,23 +1945,23 @@ L083cbc_dec_four:
leal 48(%edi),%edi
movaps %xmm5,%xmm2
subl $64,%eax
-L079cbc_dec_tail_collected:
+L081cbc_dec_tail_collected:
andl $15,%eax
- jnz L085cbc_dec_tail_partial
+ jnz L087cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L075cbc_ret
+ jmp L077cbc_ret
.align 4,0x90
-L085cbc_dec_tail_partial:
+L087cbc_dec_tail_partial:
movaps %xmm2,(%esp)
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L075cbc_ret:
+L077cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
movups %xmm7,(%ebp)
-L070cbc_abort:
+L072cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1921,51 +1970,51 @@ L070cbc_abort:
.align 4
__aesni_set_encrypt_key:
testl %eax,%eax
- jz L086bad_pointer
+ jz L088bad_pointer
testl %edx,%edx
- jz L086bad_pointer
+ jz L088bad_pointer
movups (%eax),%xmm0
xorps %xmm4,%xmm4
leal 16(%edx),%edx
cmpl $256,%ecx
- je L08714rounds
+ je L08914rounds
cmpl $192,%ecx
- je L08812rounds
+ je L09012rounds
cmpl $128,%ecx
- jne L089bad_keybits
+ jne L091bad_keybits
.align 4,0x90
-L09010rounds:
+L09210rounds:
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L091key_128_cold
+ call L093key_128_cold
.byte 102,15,58,223,200,2
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,4
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,8
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,16
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,32
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,64
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,128
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,27
- call L092key_128
+ call L094key_128
.byte 102,15,58,223,200,54
- call L092key_128
+ call L094key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
xorl %eax,%eax
ret
.align 4,0x90
-L092key_128:
+L094key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L091key_128_cold:
+L093key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1974,38 +2023,38 @@ L091key_128_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L08812rounds:
+L09012rounds:
movq 16(%eax),%xmm2
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L093key_192a_cold
+ call L095key_192a_cold
.byte 102,15,58,223,202,2
- call L094key_192b
+ call L096key_192b
.byte 102,15,58,223,202,4
- call L095key_192a
+ call L097key_192a
.byte 102,15,58,223,202,8
- call L094key_192b
+ call L096key_192b
.byte 102,15,58,223,202,16
- call L095key_192a
+ call L097key_192a
.byte 102,15,58,223,202,32
- call L094key_192b
+ call L096key_192b
.byte 102,15,58,223,202,64
- call L095key_192a
+ call L097key_192a
.byte 102,15,58,223,202,128
- call L094key_192b
+ call L096key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
xorl %eax,%eax
ret
.align 4,0x90
-L095key_192a:
+L097key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L093key_192a_cold:
+L095key_192a_cold:
movaps %xmm2,%xmm5
-L096key_192b_warm:
+L098key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2019,56 +2068,56 @@ L096key_192b_warm:
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L094key_192b:
+L096key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L096key_192b_warm
+ jmp L098key_192b_warm
.align 4,0x90
-L08714rounds:
+L08914rounds:
movups 16(%eax),%xmm2
movl $13,%ecx
leal 16(%edx),%edx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L097key_256a_cold
+ call L099key_256a_cold
.byte 102,15,58,223,200,1
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,2
- call L099key_256a
+ call L101key_256a
.byte 102,15,58,223,200,2
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,4
- call L099key_256a
+ call L101key_256a
.byte 102,15,58,223,200,4
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,8
- call L099key_256a
+ call L101key_256a
.byte 102,15,58,223,200,8
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,16
- call L099key_256a
+ call L101key_256a
.byte 102,15,58,223,200,16
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,32
- call L099key_256a
+ call L101key_256a
.byte 102,15,58,223,200,32
- call L098key_256b
+ call L100key_256b
.byte 102,15,58,223,202,64
- call L099key_256a
+ call L101key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
ret
.align 4,0x90
-L099key_256a:
+L101key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L097key_256a_cold:
+L099key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2077,7 +2126,7 @@ L097key_256a_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L098key_256b:
+L100key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2088,11 +2137,11 @@ L098key_256b:
xorps %xmm1,%xmm2
ret
.align 2,0x90
-L086bad_pointer:
+L088bad_pointer:
movl $-1,%eax
ret
.align 2,0x90
-L089bad_keybits:
+L091bad_keybits:
movl $-2,%eax
ret
.globl _aesni_set_encrypt_key
@@ -2115,7 +2164,7 @@ L_aesni_set_decrypt_key_begin:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L100dec_key_ret
+ jnz L102dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2123,7 +2172,7 @@ L_aesni_set_decrypt_key_begin:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L101dec_key_inverse:
+L103dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2133,12 +2182,12 @@ L101dec_key_inverse:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L101dec_key_inverse
+ ja L103dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
xorl %eax,%eax
-L100dec_key_ret:
+L102dec_key_ret:
ret
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
diff --git a/lib/accelerated/x86/macosx/aesni-x86_64.s b/lib/accelerated/x86/macosx/aesni-x86_64.s
index 7089c7c9d6..b58973fe6e 100644
--- a/lib/accelerated/x86/macosx/aesni-x86_64.s
+++ b/lib/accelerated/x86/macosx/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl _aesni_encrypt
.p2align 4
@@ -53,7 +54,7 @@ L$oop_enc1_1:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_enc1_1
+ jnz L$oop_enc1_1
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
@@ -74,34 +75,93 @@ L$oop_dec1_2:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_dec1_2
+ jnz L$oop_dec1_2
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$enc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop3
.byte 102,15,56,220,209
@@ -116,25 +176,26 @@ L$enc_loop3:
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$dec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop3
.byte 102,15,56,222,209
@@ -149,28 +210,30 @@ L$dec_loop3:
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$enc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop4
.byte 102,15,56,220,209
@@ -187,28 +250,30 @@ L$enc_loop4:
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$dec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop4
.byte 102,15,56,222,209
@@ -225,43 +290,43 @@ L$dec_loop4:
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%rcx),%xmm0
.byte 102,15,56,220,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
L$enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop6
.byte 102,15,56,220,209
@@ -282,43 +347,43 @@ L$enc_loop6_enter:
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
+ addq $16,%rax
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%rcx),%xmm0
.byte 102,15,56,222,249
+ movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
L$dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,51 @@ L$dec_loop6_enter:
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
+.byte 102,15,56,220,217
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop8_enter
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
L$enc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop8
.byte 102,15,56,220,209
@@ -409,52 +473,51 @@ L$enc_loop8_enter:
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ addq $16,%rax
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
+.byte 102,15,56,222,217
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
+ movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop8_enter
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
L$dec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop8
.byte 102,15,56,222,209
@@ -583,14 +646,13 @@ L$oop_enc1_3:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_3
+ jnz L$oop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp L$ecb_ret
.p2align 4
L$ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp L$ecb_ret
@@ -728,14 +790,13 @@ L$oop_dec1_4:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_4
+ jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp L$ecb_ret
@@ -782,53 +843,53 @@ L$ecb_ret:
.p2align 4
_aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa L$increment64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp L$ccm64_enc_outer
.p2align 4
L$ccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
L$ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
movups %xmm3,(%r9)
@@ -839,15 +900,15 @@ L$ccm64_enc2_loop:
.p2align 4
_aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa L$increment64(%rip),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -857,17 +918,21 @@ L$oop_enc1_5:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_5
+ jnz L$oop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -876,36 +941,36 @@ L$ccm64_dec_outer:
jz L$ccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp L$ccm64_dec2_loop
+.p2align 4
L$ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -916,7 +981,7 @@ L$oop_enc1_6:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz L$oop_enc1_6
+ jnz L$oop_enc1_6
.byte 102,15,56,221,217
movups %xmm3,(%r9)
.byte 0xf3,0xc3
@@ -925,199 +990,518 @@ L$oop_enc1_6:
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $128,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je L$ctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa L$bswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
-
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
movl 240(%rcx),%eax
+ xorl %r11d,%r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,-40(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,-24(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ movl __gnutls_x86_cpuid_s+4(%rip),%r10d
+ xorl %r11d,%r9d
+ andl $71303168,%r10d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
jb L$ctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
+
subq $6,%rdx
+ cmpl $4194304,%r10d
+ je L$ctr32_6x
+
+ leaq 128(%rcx),%rcx
+ subq $2,%rdx
+ jmp L$ctr32_loop8
+
+.p2align 4
+L$ctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
jmp L$ctr32_loop6
.p2align 4
L$ctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+ call L$enc_loop6
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc L$ctr32_loop6
- pxor %xmm0,%xmm3
+ addq $6,%rdx
+ jz L$ctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp L$ctr32_tail
+
+.p2align 5
+L$ctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa L$increment32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa -40(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp L$ctr32_enc_loop6_enter
-.p2align 4
-L$ctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$ctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ xorl %r11d,%r9d
+.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz L$ctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb L$ctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd -24(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,-40(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,-24(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je L$ctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc L$ctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
- addq $6,%rdx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp L$ctr32_enc_done
+
+.p2align 4
+L$ctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc L$ctr32_loop8
+
+ addq $8,%rdx
jz L$ctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
L$ctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb L$ctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb L$ctr32_loop3
+ je L$ctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je L$ctr32_two
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb L$ctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je L$ctr32_four
+ call L$enc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb L$ctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je L$ctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb L$ctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je L$ctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-L$ctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1127,289 +1511,303 @@ L$oop_enc1_7:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_7
+ jnz L$oop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
-L$ctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
-L$ctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp L$ctr32_done
.p2align 4
-L$ctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
L$ctr32_done:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$ctr32_epilogue:
.byte 0xf3,0xc3
.globl _aesni_xts_encrypt
.p2align 4
_aesni_xts_encrypt:
- leaq -104(%rsp),%rsp
- movups (%r9),%xmm15
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
-.p2align 4
+.p2align 5
L$xts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_enc_loop6
+.p2align 5
L$xts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$xts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_enc_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz L$xts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb L$xts_enc_one
+ pxor %xmm0,%xmm12
je L$xts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb L$xts_enc_three
+ pxor %xmm0,%xmm14
je L$xts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1450,7 +1848,7 @@ L$oop_enc1_9:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_9
+ jnz L$oop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1466,7 +1864,7 @@ L$xts_enc_two:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1512,15 +1910,15 @@ L$xts_enc_four:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_enc_done
@@ -1555,13 +1953,14 @@ L$oop_enc1_10:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_10
+ jnz L$oop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
L$xts_enc_epilogue:
.byte 0xf3,0xc3
@@ -1569,249 +1968,292 @@ L$xts_enc_epilogue:
.p2align 4
_aesni_xts_decrypt:
- leaq -104(%rsp),%rsp
- movups (%r9),%xmm15
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
-.p2align 4
+.p2align 5
L$xts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_dec_loop6
+.p2align 5
L$xts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$xts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_dec_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz L$xts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb L$xts_dec_one
+ pxor %xmm0,%xmm13
je L$xts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb L$xts_dec_three
je L$xts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1861,7 +2303,7 @@ L$oop_dec1_12:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_12
+ jnz L$oop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1878,7 +2320,7 @@ L$xts_dec_two:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1904,7 +2346,7 @@ L$xts_dec_three:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -1914,14 +2356,8 @@ L$xts_dec_three:
.p2align 4
L$xts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -1932,16 +2368,16 @@ L$xts_dec_four:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_dec_done
@@ -1965,7 +2401,7 @@ L$oop_dec1_13:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_13
+ jnz L$oop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -1995,13 +2431,14 @@ L$oop_dec1_14:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_14
+ jnz L$oop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
L$xts_dec_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
L$xts_dec_epilogue:
.byte 0xf3,0xc3
@@ -2038,7 +2475,7 @@ L$oop_enc1_15:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_15
+ jnz L$oop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2054,163 +2491,397 @@ L$oop_enc1_15:
L$cbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp L$cbc_enc_loop
+ jmp L$cbc_enc_loop
.p2align 4
L$cbc_decrypt:
- movups (%r8),%xmm9
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $16,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe L$cbc_dec_tail
- shrl $1,%r10d
- subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,-24(%rsp)
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ movl __gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $112,%rdx
+ jbe L$cbc_dec_six_or_seven
+
+ andl $71303168,%r9d
+ subq $80,%rdx
+ cmpl $4194304,%r9d
+ je L$cbc_dec_loop6_enter
+ subq $32,%rdx
+ leaq 112(%rcx),%rcx
jmp L$cbc_dec_loop8_enter
.p2align 4
L$cbc_dec_loop8:
- movaps %xmm0,-24(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
L$cbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
.byte 102,68,15,56,222,193
+ setnc %r11b
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call L$dec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja L$cbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle L$cbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+
+ movaps %xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja L$cbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+L$cbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $96,%rdx
+ ja L$cbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $80,%rdx
+ jle L$cbc_dec_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
L$cbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe L$cbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe L$cbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe L$cbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe L$cbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe L$cbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe L$cbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,-24(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp L$cbc_dec_tail_collected
+
.p2align 4
L$cbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2220,113 +2891,70 @@ L$oop_dec1_16:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_16
+ jnz L$oop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp L$cbc_dec_tail_collected
+
.p2align 4
L$cbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
- movaps %xmm2,-24(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq -24(%rsp),%rsi
-.long 0x9066A4F3
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
L$cbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
L$cbc_ret:
.byte 0xf3,0xc3
@@ -2334,7 +2962,7 @@ L$cbc_ret:
.p2align 4
_aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
@@ -2373,7 +3001,7 @@ L$SEH_end_set_decrypt_key:
.p2align 4
_aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz L$enc_key_ret
@@ -2569,6 +3197,8 @@ L$increment64:
.long 1,0,0,0
L$xts_magic:
.long 0x87,0,1,0
+L$increment1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/lib/accelerated/x86/macosx/e_padlock-x86.s b/lib/accelerated/x86/macosx/e_padlock-x86.s
index 3ba69bd352..367962c7cb 100644
--- a/lib/accelerated/x86/macosx/e_padlock-x86.s
+++ b/lib/accelerated/x86/macosx/e_padlock-x86.s
@@ -174,16 +174,14 @@ L005ecb_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe L006ecb_short
testl $32,(%edx)
- jnz L007ecb_aligned
+ jnz L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz L007ecb_aligned
+ jnz L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -195,10 +193,28 @@ L005ecb_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz L008ecb_unaligned_tail
+ jmp L007ecb_loop
.align 4,0x90
-L008ecb_loop:
+L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -223,8 +239,8 @@ L009ecb_inp_aligned:
testl $15,%edi
jz L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
L010ecb_out_aligned:
@@ -234,43 +250,75 @@ L010ecb_out_aligned:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L008ecb_loop
+ jz L011ecb_break
+ cmpl %ebx,%ecx
+ jae L007ecb_loop
+L008ecb_unaligned_tail:
+ xorl %eax,%eax
cmpl %ebp,%esp
- je L011ecb_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L007ecb_loop
+.align 4,0x90
+L011ecb_break:
+ cmpl %ebp,%esp
+ je L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-L012ecb_bzero:
+L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L012ecb_bzero
-L011ecb_done:
+ ja L013ecb_bzero
+L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp L013ecb_exit
+ jmp L014ecb_exit
.align 4,0x90
-L006ecb_short:
+L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L008ecb_loop
-.align 4,0x90
-L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-L013ecb_exit:
+ testl %ebp,%ebp
+ jz L014ecb_exit
+L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L007ecb_loop
+L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
L004ecb_abort:
@@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz L015cbc_abort
+ jnz L016cbc_abort
testl $15,%ecx
- jnz L015cbc_abort
- leal Lpadlock_saved_context-L016cbc_pic_point,%eax
+ jnz L016cbc_abort
+ leal Lpadlock_saved_context-L017cbc_pic_point,%eax
pushfl
cld
call __padlock_verify_ctx
-L016cbc_pic_point:
+L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe L017cbc_short
testl $32,(%edx)
jnz L018cbc_aligned
testl $15,%edi
@@ -324,7 +370,25 @@ L016cbc_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz L020cbc_unaligned_tail
jmp L019cbc_loop
.align 4,0x90
L019cbc_loop:
@@ -336,13 +400,13 @@ L019cbc_loop:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz L020cbc_inp_aligned
+ jz L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-L020cbc_inp_aligned:
+L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -352,61 +416,93 @@ L020cbc_inp_aligned:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz L021cbc_out_aligned
+ jz L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-L021cbc_out_aligned:
+L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L019cbc_loop
+ jz L023cbc_break
+ cmpl %ebx,%ecx
+ jae L019cbc_loop
+L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L019cbc_loop
+.align 4,0x90
+L023cbc_break:
cmpl %ebp,%esp
- je L022cbc_done
+ je L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-L023cbc_bzero:
+L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L023cbc_bzero
-L022cbc_done:
+ ja L025cbc_bzero
+L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp L024cbc_exit
-.align 4,0x90
-L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L019cbc_loop
+ jmp L026cbc_exit
.align 4,0x90
L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-L024cbc_exit:
+ testl %ebp,%ebp
+ jz L026cbc_exit
+L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp L019cbc_loop
+L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-L015cbc_abort:
+L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -425,25 +521,25 @@ L_padlock_cfb_encrypt_begin:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz L026cfb_abort
+ jnz L028cfb_abort
testl $15,%ecx
- jnz L026cfb_abort
- leal Lpadlock_saved_context-L027cfb_pic_point,%eax
+ jnz L028cfb_abort
+ leal Lpadlock_saved_context-L029cfb_pic_point,%eax
pushfl
cld
call __padlock_verify_ctx
-L027cfb_pic_point:
+L029cfb_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
testl $32,(%edx)
- jnz L028cfb_aligned
+ jnz L030cfb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz L028cfb_aligned
+ jnz L030cfb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -455,10 +551,15 @@ L027cfb_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp L029cfb_loop
+ movl %eax,16(%ebp)
+ jmp L031cfb_loop
.align 4,0x90
-L029cfb_loop:
+L031cfb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -467,13 +568,13 @@ L029cfb_loop:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz L030cfb_inp_aligned
+ jz L032cfb_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-L030cfb_inp_aligned:
+L032cfb_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -483,61 +584,45 @@ L030cfb_inp_aligned:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz L031cfb_out_aligned
+ jz L033cfb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-L031cfb_out_aligned:
+L033cfb_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L029cfb_loop
+ jnz L031cfb_loop
cmpl %ebp,%esp
- je L032cfb_done
+ je L034cfb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-L033cfb_bzero:
+L035cfb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L033cfb_bzero
-L032cfb_done:
+ ja L035cfb_bzero
+L034cfb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp L034cfb_exit
+ jmp L036cfb_exit
.align 4,0x90
-L035cfb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L036cfb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L036cfb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L029cfb_loop
-.align 4,0x90
-L028cfb_aligned:
+L030cfb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,224
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-L034cfb_exit:
+L036cfb_exit:
movl $1,%eax
leal 4(%esp),%esp
-L026cfb_abort:
+L028cfb_abort:
popl %edi
popl %esi
popl %ebx
@@ -586,7 +671,12 @@ L038ofb_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
jmp L040ofb_loop
.align 4,0x90
L040ofb_loop:
@@ -616,8 +706,8 @@ L041ofb_inp_aligned:
testl $15,%edi
jz L042ofb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
L042ofb_out_aligned:
@@ -638,26 +728,10 @@ L044ofb_bzero:
cmpl %eax,%ebp
ja L044ofb_bzero
L043ofb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
jmp L045ofb_exit
.align 4,0x90
-L046ofb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-L047ofb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja L047ofb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp L040ofb_loop
-.align 4,0x90
L039ofb_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
@@ -687,14 +761,14 @@ L_padlock_ctr32_encrypt_begin:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz L048ctr32_abort
+ jnz L046ctr32_abort
testl $15,%ecx
- jnz L048ctr32_abort
- leal Lpadlock_saved_context-L049ctr32_pic_point,%eax
+ jnz L046ctr32_abort
+ leal Lpadlock_saved_context-L047ctr32_pic_point,%eax
pushfl
cld
call __padlock_verify_ctx
-L049ctr32_pic_point:
+L047ctr32_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
movq -16(%edx),%mm0
@@ -708,10 +782,15 @@ L049ctr32_pic_point:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp L050ctr32_loop
+ movl %eax,16(%ebp)
+ jmp L048ctr32_loop
.align 4,0x90
-L050ctr32_loop:
+L048ctr32_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -720,7 +799,7 @@ L050ctr32_loop:
movl -4(%edx),%ecx
xorl %edi,%edi
movl -8(%edx),%eax
-L051ctr32_prepare:
+L049ctr32_prepare:
movl %ecx,12(%esp,%edi,1)
bswap %ecx
movq %mm0,(%esp,%edi,1)
@@ -729,7 +808,7 @@ L051ctr32_prepare:
bswap %ecx
leal 16(%edi),%edi
cmpl %ebx,%edi
- jb L051ctr32_prepare
+ jb L049ctr32_prepare
movl %ecx,-4(%edx)
leal (%esp),%esi
leal (%esp),%edi
@@ -742,32 +821,33 @@ L051ctr32_prepare:
movl 12(%ebp),%ebx
movl 4(%ebp),%esi
xorl %ecx,%ecx
-L052ctr32_xor:
+L050ctr32_xor:
movups (%esi,%ecx,1),%xmm1
leal 16(%ecx),%ecx
pxor -16(%esp,%ecx,1),%xmm1
movups %xmm1,-16(%edi,%ecx,1)
cmpl %ebx,%ecx
- jb L052ctr32_xor
+ jb L050ctr32_xor
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz L050ctr32_loop
+ jnz L048ctr32_loop
pxor %xmm0,%xmm0
leal (%esp),%eax
-L053ctr32_bzero:
+L051ctr32_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja L053ctr32_bzero
-L054ctr32_done:
+ ja L051ctr32_bzero
+L052ctr32_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
movl $1,%eax
leal 4(%esp),%esp
emms
-L048ctr32_abort:
+L046ctr32_abort:
popl %edi
popl %esi
popl %ebx
@@ -789,10 +869,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne L055ret
+ jne L053ret
addl $4,184(%ecx)
movl $0,%eax
-L055ret:
+L053ret:
ret
.globl _padlock_sha1_oneshot
.align 4
diff --git a/lib/accelerated/x86/macosx/e_padlock-x86_64.s b/lib/accelerated/x86/macosx/e_padlock-x86_64.s
index 00c2bdaac0..ec1377ad88 100644
--- a/lib/accelerated/x86/macosx/e_padlock-x86_64.s
+++ b/lib/accelerated/x86/macosx/e_padlock-x86_64.s
@@ -127,7 +127,7 @@ _padlock_aes_block:
movq $1,%rcx
leaq 32(%rdx),%rbx
leaq 16(%rdx),%rdx
-.byte 0xf3,0x0f,0xa7,0xc8
+.byte 0xf3,0x0f,0xa7,0xc8
movq %r8,%rbx
.byte 0xf3,0xc3
@@ -137,7 +137,7 @@ _padlock_aes_block:
.p2align 4
_padlock_xstore:
movl %esi,%edx
-.byte 0x0f,0xa7,0xc0
+.byte 0x0f,0xa7,0xc0
.byte 0xf3,0xc3
@@ -154,7 +154,7 @@ _padlock_sha1_oneshot:
movq %rsp,%rdi
movl %eax,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -176,7 +176,7 @@ _padlock_sha1_blocks:
movq %rsp,%rdi
movl %eax,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
movaps (%rsp),%xmm0
movl 16(%rsp),%eax
addq $128+8,%rsp
@@ -198,7 +198,7 @@ _padlock_sha256_oneshot:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -220,7 +220,7 @@ _padlock_sha256_blocks:
movq %rsp,%rdi
movaps %xmm1,16(%rsp)
movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
addq $128+8,%rsp
@@ -245,7 +245,7 @@ _padlock_sha512_blocks:
movaps %xmm1,16(%rsp)
movaps %xmm2,32(%rsp)
movaps %xmm3,48(%rsp)
-.byte 0xf3,0x0f,0xa6,0xe0
+.byte 0xf3,0x0f,0xa6,0xe0
movaps (%rsp),%xmm0
movaps 16(%rsp),%xmm1
movaps 32(%rsp),%xmm2
@@ -276,8 +276,6 @@ _padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe L$ecb_short
testl $32,(%rdx)
jnz L$ecb_aligned
testq $15,%rdi
@@ -297,6 +295,21 @@ _padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja L$ecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$ecb_unaligned_tail
jmp L$ecb_loop
.p2align 4
L$ecb_loop:
@@ -312,7 +325,7 @@ L$ecb_loop:
testq $15,%rsi
jz L$ecb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -320,15 +333,15 @@ L$ecb_inp_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz L$ecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$ecb_out_aligned:
movq %r9,%rsi
@@ -337,9 +350,26 @@ L$ecb_out_aligned:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz L$ecb_loop
-
+ jz L$ecb_break
+ cmpq %rbx,%rcx
+ jae L$ecb_loop
+L$ecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$ecb_loop
+.p2align 4
+L$ecb_break:
+ cmpq %rbp,%rsp
je L$ecb_done
pxor %xmm0,%xmm0
@@ -353,26 +383,39 @@ L$ecb_bzero:
L$ecb_done:
leaq (%rbp),%rsp
jmp L$ecb_exit
-.p2align 4
-L$ecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-L$ecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja L$ecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp L$ecb_loop
+
.p2align 4
L$ecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$ecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz L$ecb_exit
+
+L$ecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$ecb_loop
L$ecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -400,8 +443,6 @@ _padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe L$cbc_short
testl $32,(%rdx)
jnz L$cbc_aligned
testq $15,%rdi
@@ -421,6 +462,21 @@ _padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja L$cbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$cbc_unaligned_tail
jmp L$cbc_loop
.p2align 4
L$cbc_loop:
@@ -436,7 +492,7 @@ L$cbc_loop:
testq $15,%rsi
jz L$cbc_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -444,7 +500,7 @@ L$cbc_inp_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -452,9 +508,9 @@ L$cbc_inp_aligned:
testq $15,%rdi
jz L$cbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$cbc_out_aligned:
movq %r9,%rsi
@@ -463,9 +519,26 @@ L$cbc_out_aligned:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz L$cbc_loop
-
+ jz L$cbc_break
+ cmpq %rbx,%rcx
+ jae L$cbc_loop
+L$cbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$cbc_loop
+.p2align 4
+L$cbc_break:
+ cmpq %rbp,%rsp
je L$cbc_done
pxor %xmm0,%xmm0
@@ -479,28 +552,41 @@ L$cbc_bzero:
L$cbc_done:
leaq (%rbp),%rsp
jmp L$cbc_exit
-.p2align 4
-L$cbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-L$cbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja L$cbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp L$cbc_loop
+
.p2align 4
L$cbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$cbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz L$cbc_exit
+
+L$cbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$cbc_loop
L$cbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -547,6 +633,8 @@ _padlock_cfb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp L$cfb_loop
.p2align 4
L$cfb_loop:
@@ -562,7 +650,7 @@ L$cfb_loop:
testq $15,%rsi
jz L$cfb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -570,7 +658,7 @@ L$cfb_inp_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -578,9 +666,9 @@ L$cfb_inp_aligned:
testq $15,%rdi
jz L$cfb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$cfb_out_aligned:
movq %r9,%rsi
@@ -590,8 +678,7 @@ L$cfb_out_aligned:
subq %rbx,%rcx
movq $512,%rbx
jnz L$cfb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je L$cfb_done
pxor %xmm0,%xmm0
@@ -605,12 +692,13 @@ L$cfb_bzero:
L$cfb_done:
leaq (%rbp),%rsp
jmp L$cfb_exit
+
.p2align 4
L$cfb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
L$cfb_exit:
@@ -659,6 +747,8 @@ _padlock_ofb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
jmp L$ofb_loop
.p2align 4
L$ofb_loop:
@@ -674,7 +764,7 @@ L$ofb_loop:
testq $15,%rsi
jz L$ofb_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -682,7 +772,7 @@ L$ofb_inp_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
movq %r8,%rdi
@@ -690,9 +780,9 @@ L$ofb_inp_aligned:
testq $15,%rdi
jz L$ofb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$ofb_out_aligned:
movq %r9,%rsi
@@ -702,8 +792,7 @@ L$ofb_out_aligned:
subq %rbx,%rcx
movq $512,%rbx
jnz L$ofb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
je L$ofb_done
pxor %xmm0,%xmm0
@@ -717,12 +806,13 @@ L$ofb_bzero:
L$ofb_done:
leaq (%rbp),%rsp
jmp L$ofb_exit
+
.p2align 4
L$ofb_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
L$ofb_exit:
@@ -752,8 +842,6 @@ _padlock_ctr32_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe L$ctr32_short
testl $32,(%rdx)
jnz L$ctr32_aligned
testq $15,%rdi
@@ -773,15 +861,32 @@ _padlock_ctr32_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
L$ctr32_reenter:
movl -4(%rdx),%eax
bswapl %eax
negl %eax
andl $31,%eax
- jz L$ctr32_loop
+ movq $512,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja L$ctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$ctr32_unaligned_tail
jmp L$ctr32_loop
.p2align 4
L$ctr32_loop:
@@ -797,7 +902,7 @@ L$ctr32_loop:
testq $15,%rsi
jz L$ctr32_inp_aligned
shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
movq %rbx,%rcx
movq %rdi,%rsi
@@ -805,23 +910,23 @@ L$ctr32_inp_aligned:
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
movl -4(%rdx),%eax
testl $4294901760,%eax
- jnz L$ctr32_no_corr
+ jnz L$ctr32_no_carry
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
-L$ctr32_no_corr:
+L$ctr32_no_carry:
movq %r8,%rdi
movq %r11,%rbx
testq $15,%rdi
jz L$ctr32_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
L$ctr32_out_aligned:
movq %r9,%rsi
@@ -830,9 +935,38 @@ L$ctr32_out_aligned:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
+ jz L$ctr32_break
+ cmpq %rbx,%rcx
+ jae L$ctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
jnz L$ctr32_loop
-
+L$ctr32_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
+.p2align 4
+L$ctr32_break:
+ cmpq %rbp,%rsp
je L$ctr32_done
pxor %xmm0,%xmm0
@@ -846,56 +980,75 @@ L$ctr32_bzero:
L$ctr32_done:
leaq (%rbp),%rsp
jmp L$ctr32_exit
-.p2align 4
-L$ctr32_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-L$ctr32_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja L$ctr32_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp L$ctr32_reenter
+
.p2align 4
L$ctr32_aligned:
movl -4(%rdx),%eax
- movq $1048576,%rbx
bswapl %eax
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
negl %eax
andl $65535,%eax
- jz L$ctr32_aligned_loop
+ movq $1048576,%rbx
shll $4,%eax
+ cmovzq %rbx,%rax
cmpq %rax,%rcx
cmovaq %rax,%rbx
- jmp L$ctr32_aligned_loop
-.p2align 4
+ cmovbeq %rcx,%rbx
+ jbe L$ctr32_aligned_skip
+
L$ctr32_aligned_loop:
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
movq %rcx,%r10
movq %rbx,%rcx
movq %rbx,%r11
+
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
+
movl -4(%rdx),%eax
bswapl %eax
addl $65536,%eax
bswapl %eax
movl %eax,-4(%rdx)
- movq %r11,%rbx
movq %r10,%rcx
- subq %rbx,%rcx
+ subq %r11,%rcx
movq $1048576,%rbx
- jnz L$ctr32_aligned_loop
+ jz L$ctr32_exit
+ cmpq %rbx,%rcx
+ jae L$ctr32_aligned_loop
+
+L$ctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$ctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz L$ctr32_exit
+
+L$ctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
L$ctr32_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s
index 8aa7ffc04f..a63034a6fc 100644
--- a/lib/accelerated/x86/macosx/ghash-x86_64.s
+++ b/lib/accelerated/x86/macosx/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl _gcm_gmult_4bit
.p2align 4
@@ -697,6 +698,7 @@ L$ghash_epilogue:
.p2align 4
_gcm_init_clmul:
+L$_init_clmul:
movdqu (%rsi),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -715,15 +717,15 @@ _gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -733,44 +735,134 @@ _gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.globl _gcm_gmult_clmul
.p2align 4
_gcm_gmult_clmul:
+L$_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa L$bswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -783,201 +875,381 @@ _gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.globl _gcm_ghash_clmul
-.p2align 4
+.p2align 5
_gcm_ghash_clmul:
- movdqa L$bswap_mask(%rip),%xmm5
+L$_ghash_clmul:
+ movdqa L$bswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
subq $16,%rcx
jz L$odd_tail
- movdqu 16(%rsi),%xmm8
-
+ movdqu 16(%rsi),%xmm6
+ movl __gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $48,%rcx
+ jb L$skip4x
+
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je L$skip4x
+
+ subq $48,%rcx
+ movq $11547335547999543296,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
+
+
+
+
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc L$tail4x
+
+ jmp L$mod4_loop
+.p2align 5
+L$mod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+.byte 102,68,15,58,68,218,0
+ xorps %xmm4,%xmm8
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ pxor %xmm3,%xmm4
+ pxor %xmm1,%xmm8
+ movdqa %xmm8,%xmm9
+ pslldq $8,%xmm8
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa L$7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+.byte 102,68,15,58,68,231,0
+ pxor %xmm0,%xmm9
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc L$mod4_loop
+
+L$tail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
-
- movdqa %xmm3,%xmm4
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz L$done
+ movdqu 32(%rsi),%xmm7
+ subq $16,%rcx
+ jz L$odd_tail
+L$skip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
+ nop
subq $32,%rcx
jbe L$even_tail
+ nop
+ jmp L$mod_loop
+.p2align 5
L$mod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm5
+ pxor %xmm0,%xmm8
+.byte 102,65,15,56,0,234
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm5,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
-
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
- pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
-
-.byte 102,15,58,68,250,17
- movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+.byte 102,15,58,68,234,17
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+.byte 0x66,0x90
- leaq 32(%rdx),%rdx
subq $32,%rcx
ja L$mod_loop
L$even_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %rcx,%rcx
jnz L$done
L$odd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -987,38 +1259,60 @@ L$odd_tail:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
L$done:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-L$SEH_end_gcm_ghash_clmul:
+
+.globl _gcm_init_avx
+
+.p2align 5
+_gcm_init_avx:
+ jmp L$_init_clmul
+
+.globl _gcm_gmult_avx
+
+.p2align 5
+_gcm_gmult_avx:
+ jmp L$_gmult_clmul
+
+.globl _gcm_ghash_avx
+
+.p2align 5
+_gcm_ghash_avx:
+ jmp L$_ghash_clmul
.p2align 6
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long 7,0,7,0
+L$7_mask_poly:
+.long 7,0,450,0
.p2align 6
L$rem_4bit:
diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s
index d7f1e40b18..9091bd802e 100644
--- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s
+++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s
@@ -46,23 +46,25 @@
_sha1_block_data_order:
movl __gnutls_x86_cpuid_s+0(%rip),%r9d
movl __gnutls_x86_cpuid_s+4(%rip),%r8d
+ movl __gnutls_x86_cpuid_s+8(%rip),%r10d
testl $512,%r8d
jz L$ialu
jmp _ssse3_shortcut
.p2align 4
L$ialu:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
- movq %rsp,%r11
+ pushq %r14
movq %rdi,%r8
subq $72,%rsp
movq %rsi,%r9
andq $-64,%rsp
movq %rdx,%r10
- movq %r11,64(%rsp)
+ movq %rax,64(%rsp)
L$prologue:
movl 0(%r8),%esi
@@ -76,1230 +78,1168 @@ L$prologue:
L$loop:
movl 0(%r9),%edx
bswapl %edx
- movl %edx,0(%rsp)
- movl %r11d,%eax
movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %ebp
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,4(%rsp)
+ leal 1518500249(%rdx,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
- movl 8(%r9),%edx
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,8(%rsp)
+ leal 1518500249(%rbp,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 12(%r9),%ebp
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,12(%rsp)
+ leal 1518500249(%r14,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 16(%r9),%edx
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,16(%rsp)
+ leal 1518500249(%rdx,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 20(%r9),%ebp
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,20(%rsp)
+ leal 1518500249(%rbp,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
bswapl %edx
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r13,1),%r13d
andl %edi,%eax
- movl %edx,24(%rsp)
+ leal 1518500249(%r14,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %ebp
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r12,1),%r12d
andl %esi,%eax
- movl %ebp,28(%rsp)
+ leal 1518500249(%rdx,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
- movl 32(%r9),%edx
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r11,1),%r11d
andl %r13d,%eax
- movl %edx,32(%rsp)
+ leal 1518500249(%rbp,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 36(%r9),%ebp
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rdi,1),%edi
andl %r12d,%eax
- movl %ebp,36(%rsp)
+ leal 1518500249(%r14,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 40(%r9),%edx
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rsi,1),%esi
andl %r11d,%eax
- movl %edx,40(%rsp)
+ leal 1518500249(%rdx,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl %r11d,%eax
- movl 44(%r9),%ebp
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
movl %esi,%ecx
- xorl %r12d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
andl %edi,%eax
- movl %ebp,44(%rsp)
+ leal 1518500249(%rbp,%r13,1),%r13d
addl %ecx,%r13d
xorl %r12d,%eax
roll $30,%edi
addl %eax,%r13d
- movl %edi,%eax
movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
movl %r13d,%ecx
- xorl %r11d,%eax
bswapl %edx
+ xorl %edi,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
andl %esi,%eax
- movl %edx,48(%rsp)
+ leal 1518500249(%r14,%r12,1),%r12d
addl %ecx,%r12d
xorl %r11d,%eax
roll $30,%esi
addl %eax,%r12d
- movl %esi,%eax
movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
movl %r12d,%ecx
- xorl %edi,%eax
bswapl %ebp
+ xorl %esi,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
andl %r13d,%eax
- movl %ebp,52(%rsp)
+ leal 1518500249(%rdx,%r11,1),%r11d
addl %ecx,%r11d
xorl %edi,%eax
roll $30,%r13d
addl %eax,%r11d
- movl %r13d,%eax
- movl 56(%r9),%edx
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
andl %r12d,%eax
- movl %edx,56(%rsp)
+ leal 1518500249(%rbp,%rdi,1),%edi
addl %ecx,%edi
xorl %esi,%eax
roll $30,%r12d
addl %eax,%edi
- movl %r12d,%eax
- movl 60(%r9),%ebp
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
andl %r11d,%eax
- movl %ebp,60(%rsp)
+ leal 1518500249(%r14,%rsi,1),%esi
addl %ecx,%esi
xorl %r13d,%eax
roll $30,%r11d
addl %eax,%esi
- movl 0(%rsp),%edx
- movl %r11d,%eax
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %esi,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
roll $5,%ecx
- xorl 32(%rsp),%edx
+ xorl 32(%rsp),%ebp
andl %edi,%eax
- leal 1518500249(%rbp,%r13,1),%r13d
- xorl 52(%rsp),%edx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
xorl %r12d,%eax
- roll $1,%edx
addl %ecx,%r13d
- roll $30,%edi
- movl %edx,0(%rsp)
+ roll $1,%ebp
addl %eax,%r13d
- movl 4(%rsp),%ebp
- movl %edi,%eax
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %r13d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
- xorl 36(%rsp),%ebp
+ xorl 36(%rsp),%r14d
andl %esi,%eax
- leal 1518500249(%rdx,%r12,1),%r12d
- xorl 56(%rsp),%ebp
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
xorl %r11d,%eax
- roll $1,%ebp
addl %ecx,%r12d
- roll $30,%esi
- movl %ebp,4(%rsp)
+ roll $1,%r14d
addl %eax,%r12d
- movl 8(%rsp),%edx
- movl %esi,%eax
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %r12d,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
roll $5,%ecx
xorl 40(%rsp),%edx
andl %r13d,%eax
- leal 1518500249(%rbp,%r11,1),%r11d
- xorl 60(%rsp),%edx
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
xorl %edi,%eax
- roll $1,%edx
addl %ecx,%r11d
- roll $30,%r13d
- movl %edx,8(%rsp)
+ roll $1,%edx
addl %eax,%r11d
- movl 12(%rsp),%ebp
- movl %r13d,%eax
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
movl %r11d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r13d,%eax
roll $5,%ecx
xorl 44(%rsp),%ebp
andl %r12d,%eax
leal 1518500249(%rdx,%rdi,1),%edi
- xorl 0(%rsp),%ebp
+ roll $30,%r12d
xorl %esi,%eax
- roll $1,%ebp
addl %ecx,%edi
- roll $30,%r12d
- movl %ebp,12(%rsp)
+ roll $1,%ebp
addl %eax,%edi
- movl 16(%rsp),%edx
- movl %r12d,%eax
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
movl %edi,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
- xorl 48(%rsp),%edx
+ xorl 48(%rsp),%r14d
andl %r11d,%eax
leal 1518500249(%rbp,%rsi,1),%esi
- xorl 4(%rsp),%edx
+ roll $30,%r11d
xorl %r13d,%eax
- roll $1,%edx
addl %ecx,%esi
- roll $30,%r11d
- movl %edx,16(%rsp)
+ roll $1,%r14d
addl %eax,%esi
- movl 20(%rsp),%ebp
- movl %r11d,%eax
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
movl %esi,%ecx
- xorl 28(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 8(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %edi,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
movl %r13d,%ecx
- xorl 32(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r12,1),%r12d
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 12(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
movl %r12d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 16(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
movl %r11d,%ecx
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
xorl 0(%rsp),%edx
- xorl %esi,%eax
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 20(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r12d,%eax
movl %edi,%ecx
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
xorl 4(%rsp),%ebp
- xorl %r13d,%eax
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 24(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r11d,%eax
movl %esi,%ecx
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal 1859775393(%rbp,%r13,1),%r13d
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 28(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
movl %r13d,%ecx
- xorl 52(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 32(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %esi,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
movl %r12d,%ecx
- xorl 56(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 36(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
movl %r11d,%ecx
- xorl 60(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 40(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
movl %edi,%ecx
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ xorl %r13d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 44(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
xorl 28(%rsp),%ebp
- xorl %r12d,%eax
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 48(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 32(%rsp),%r14d
leal 1859775393(%rbp,%r12,1),%r12d
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 52(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
movl %r12d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 56(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
- xorl 40(%rsp),%edx
+ xorl 16(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 60(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl 44(%rsp),%ebp
+ xorl 20(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 0(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
movl %esi,%ecx
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal 1859775393(%rbp,%r13,1),%r13d
xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 4(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
xorl 52(%rsp),%ebp
- xorl %r11d,%eax
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 8(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 32(%rsp),%edx
- xorl %r13d,%eax
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 56(%rsp),%r14d
leal 1859775393(%rbp,%r11,1),%r11d
- xorl 56(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 12(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
movl %r11d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 16(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
- xorl 0(%rsp),%edx
+ xorl 40(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 20(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 44(%rsp),%ebp
- andl %r12d,%eax
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 4(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,36(%rsp)
addl %ecx,%r13d
- movl 40(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
xorl 48(%rsp),%edx
- andl %r11d,%eax
+ andl %edi,%eax
movl %r13d,%ecx
xorl 8(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r12d
- andl %esi,%ebx
roll $1,%edx
- addl %ebx,%r12d
+ andl %esi,%ebx
+ addl %ecx,%r12d
roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
movl %edx,40(%rsp)
- addl %ecx,%r12d
- movl 44(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
+ movl %edi,%ebx
xorl 52(%rsp),%ebp
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 12(%rsp),%ebp
- xorl %edi,%ebx
leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%ebp
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,44(%rsp)
- addl %ecx,%r11d
- movl 48(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 56(%rsp),%edx
- andl %esi,%eax
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %esi,%ebx
+ xorl 16(%rsp),%r14d
leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%edx
- addl %ebx,%edi
- roll $30,%r12d
- movl %edx,48(%rsp)
addl %ecx,%edi
- movl 52(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 60(%rsp),%ebp
- andl %r13d,%eax
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r13d,%ebx
- leal -1894007588(%rdx,%rsi,1),%esi
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 40(%rsp),%ebp
addl %eax,%esi
+ roll $1,%edx
andl %r11d,%ebx
- roll $1,%ebp
- addl %ebx,%esi
- roll $30,%r11d
- movl %ebp,52(%rsp)
addl %ecx,%esi
- movl 56(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 0(%rsp),%edx
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 24(%rsp),%edx
- xorl %r12d,%ebx
- leal -1894007588(%rbp,%r13,1),%r13d
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 44(%rsp),%edx
addl %eax,%r13d
+ roll $1,%ebp
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,56(%rsp)
addl %ecx,%r13d
- movl 60(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 4(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 48(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%r14d
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,60(%rsp)
addl %ecx,%r12d
- movl 0(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
xorl 8(%rsp),%edx
- andl %edi,%eax
+ andl %esi,%eax
movl %r12d,%ecx
xorl 32(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 52(%rsp),%edx
addl %eax,%r11d
- andl %r13d,%ebx
roll $1,%edx
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
movl %edx,0(%rsp)
- addl %ecx,%r11d
- movl 4(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
+ movl %esi,%ebx
xorl 12(%rsp),%ebp
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 36(%rsp),%ebp
- xorl %esi,%ebx
leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 56(%rsp),%ebp
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%ebp
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,4(%rsp)
- addl %ecx,%edi
- movl 8(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 16(%rsp),%edx
- andl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r13d,%ebx
+ xorl 40(%rsp),%r14d
leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 60(%rsp),%edx
addl %eax,%esi
+ roll $1,%r14d
andl %r11d,%ebx
- roll $1,%edx
- addl %ebx,%esi
- roll $30,%r11d
- movl %edx,8(%rsp)
addl %ecx,%esi
- movl 12(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 20(%rsp),%ebp
- andl %r12d,%eax
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 44(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 0(%rsp),%ebp
addl %eax,%r13d
+ roll $1,%edx
andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,12(%rsp)
addl %ecx,%r13d
- movl 16(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
- xorl 24(%rsp),%edx
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 48(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 4(%rsp),%edx
addl %eax,%r12d
+ roll $1,%ebp
andl %esi,%ebx
- roll $1,%edx
- addl %ebx,%r12d
- roll $30,%esi
- movl %edx,16(%rsp)
addl %ecx,%r12d
- movl 20(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
- xorl 28(%rsp),%ebp
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 52(%rsp),%ebp
- xorl %edi,%ebx
- leal -1894007588(%rdx,%r11,1),%r11d
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 8(%rsp),%ebp
addl %eax,%r11d
+ roll $1,%r14d
andl %r13d,%ebx
- roll $1,%ebp
- addl %ebx,%r11d
- roll $30,%r13d
- movl %ebp,20(%rsp)
addl %ecx,%r11d
- movl 24(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
xorl 32(%rsp),%edx
- andl %esi,%eax
+ andl %r13d,%eax
movl %r11d,%ecx
xorl 56(%rsp),%edx
- xorl %esi,%ebx
- leal -1894007588(%rbp,%rdi,1),%edi
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 12(%rsp),%edx
addl %eax,%edi
- andl %r12d,%ebx
roll $1,%edx
- addl %ebx,%edi
+ andl %r12d,%ebx
+ addl %ecx,%edi
roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
movl %edx,24(%rsp)
- addl %ecx,%edi
- movl 28(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
+ movl %r13d,%ebx
xorl 36(%rsp),%ebp
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 60(%rsp),%ebp
- xorl %r13d,%ebx
leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 16(%rsp),%ebp
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%ebp
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
movl %ebp,28(%rsp)
- addl %ecx,%esi
- movl 32(%rsp),%edx
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 40(%rsp),%edx
- andl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
movl %esi,%ecx
- xorl 0(%rsp),%edx
- xorl %r12d,%ebx
+ xorl 0(%rsp),%r14d
leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
roll $5,%ecx
- xorl 20(%rsp),%edx
addl %eax,%r13d
+ roll $1,%r14d
andl %edi,%ebx
- roll $1,%edx
- addl %ebx,%r13d
- roll $30,%edi
- movl %edx,32(%rsp)
addl %ecx,%r13d
- movl 36(%rsp),%ebp
- movl %edi,%eax
- movl %edi,%ebx
- xorl 44(%rsp),%ebp
- andl %r11d,%eax
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
movl %r13d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r11d,%ebx
- leal -1894007588(%rdx,%r12,1),%r12d
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
roll $5,%ecx
- xorl 24(%rsp),%ebp
addl %eax,%r12d
+ roll $1,%edx
andl %esi,%ebx
- roll $1,%ebp
- addl %ebx,%r12d
- roll $30,%esi
- movl %ebp,36(%rsp)
addl %ecx,%r12d
- movl 40(%rsp),%edx
- movl %esi,%eax
- movl %esi,%ebx
- xorl 48(%rsp),%edx
- andl %edi,%eax
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
movl %r12d,%ecx
- xorl 8(%rsp),%edx
- xorl %edi,%ebx
- leal -1894007588(%rbp,%r11,1),%r11d
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
roll $5,%ecx
- xorl 28(%rsp),%edx
addl %eax,%r11d
+ roll $1,%ebp
andl %r13d,%ebx
- roll $1,%edx
- addl %ebx,%r11d
- roll $30,%r13d
- movl %edx,40(%rsp)
addl %ecx,%r11d
- movl 44(%rsp),%ebp
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 52(%rsp),%ebp
- andl %esi,%eax
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
movl %r11d,%ecx
- xorl 12(%rsp),%ebp
- xorl %esi,%ebx
- leal -1894007588(%rdx,%rdi,1),%edi
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
roll $5,%ecx
- xorl 32(%rsp),%ebp
addl %eax,%edi
+ roll $1,%r14d
andl %r12d,%ebx
- roll $1,%ebp
- addl %ebx,%edi
- roll $30,%r12d
- movl %ebp,44(%rsp)
addl %ecx,%edi
- movl 48(%rsp),%edx
- movl %r12d,%eax
- movl %r12d,%ebx
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
xorl 56(%rsp),%edx
- andl %r13d,%eax
+ andl %r12d,%eax
movl %edi,%ecx
xorl 16(%rsp),%edx
- xorl %r13d,%ebx
- leal -1894007588(%rbp,%rsi,1),%esi
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
roll $5,%ecx
- xorl 36(%rsp),%edx
addl %eax,%esi
- andl %r11d,%ebx
roll $1,%edx
- addl %ebx,%esi
+ andl %r11d,%ebx
+ addl %ecx,%esi
roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
movl %edx,48(%rsp)
- addl %ecx,%esi
- movl 52(%rsp),%ebp
- movl %r11d,%eax
movl %esi,%ecx
xorl 60(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
xorl 20(%rsp),%ebp
- xorl %r12d,%eax
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 40(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %edi,%eax
movl %r13d,%ecx
- xorl 0(%rsp),%edx
- xorl %esi,%eax
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
roll $5,%ecx
+ xorl 24(%rsp),%r14d
leal -899497514(%rbp,%r12,1),%r12d
- xorl 24(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 44(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
- roll $1,%edx
- movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
movl %r12d,%ecx
- xorl 4(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
- xorl 28(%rsp),%ebp
+ xorl 4(%rsp),%edx
xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 48(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
movl %r11d,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rdi,1),%edi
- xorl 32(%rsp),%edx
+ xorl 8(%rsp),%ebp
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 52(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
movl %edi,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%r14d
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 56(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
movl %esi,%ecx
xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 60(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %edi,%eax
movl %r13d,%ecx
xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 0(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %esi,%eax
movl %r12d,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
roll $5,%ecx
+ xorl 48(%rsp),%r14d
leal -899497514(%rbp,%r11,1),%r11d
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 4(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
- roll $1,%edx
- movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
movl %r11d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 8(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
movl %edi,%ecx
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%rsi,1),%esi
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 12(%rsp),%edx
roll $30,%r11d
addl %eax,%esi
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r11d,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
movl %esi,%ecx
- xorl 36(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r13,1),%r13d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 16(%rsp),%ebp
roll $30,%edi
addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
movl %r13d,%ecx
xorl 40(%rsp),%edx
- xorl %esi,%eax
+ xorl %r11d,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r12,1),%r12d
xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 20(%rsp),%edx
roll $30,%esi
addl %eax,%r12d
roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %esi,%eax
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 44(%rsp),%ebp
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%r11,1),%r11d
xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 24(%rsp),%ebp
roll $30,%r13d
addl %eax,%r11d
roll $1,%ebp
- movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r13d,%eax
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
movl %r11d,%ecx
- xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
roll $5,%ecx
+ xorl 8(%rsp),%r14d
leal -899497514(%rbp,%rdi,1),%edi
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 28(%rsp),%edx
roll $30,%r12d
addl %eax,%edi
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
movl %edi,%ecx
- xorl 52(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%rsi,1),%esi
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
addl %ecx,%esi
- xorl 32(%rsp),%ebp
roll $30,%r11d
addl %eax,%esi
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
movl %esi,%ecx
- xorl 56(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- leal -899497514(%rbp,%r13,1),%r13d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
addl %ecx,%r13d
- xorl 36(%rsp),%edx
roll $30,%edi
addl %eax,%r13d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
movl %r13d,%ecx
- xorl 60(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal -899497514(%rdx,%r12,1),%r12d
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
addl %ecx,%r12d
- xorl 40(%rsp),%ebp
roll $30,%esi
addl %eax,%r12d
- roll $1,%ebp
- movl 56(%rsp),%edx
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
movl %r12d,%ecx
xorl 0(%rsp),%edx
- xorl %r13d,%eax
+ xorl %edi,%eax
roll $5,%ecx
- leal -899497514(%rbp,%r11,1),%r11d
xorl 24(%rsp),%edx
- xorl %edi,%eax
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
addl %ecx,%r11d
- xorl 44(%rsp),%edx
roll $30,%r13d
addl %eax,%r11d
roll $1,%edx
- movl 60(%rsp),%ebp
- movl %r13d,%eax
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
movl %r11d,%ecx
xorl 4(%rsp),%ebp
- xorl %r12d,%eax
+ xorl %esi,%eax
roll $5,%ecx
- leal -899497514(%rdx,%rdi,1),%edi
xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
addl %ecx,%edi
- xorl 48(%rsp),%ebp
roll $30,%r12d
addl %eax,%edi
roll $1,%ebp
- movl %r12d,%eax
+ movl %r11d,%eax
movl %edi,%ecx
- xorl %r11d,%eax
+ xorl %r13d,%eax
leal -899497514(%rbp,%rsi,1),%esi
roll $5,%ecx
- xorl %r13d,%eax
+ xorl %r12d,%eax
addl %ecx,%esi
roll $30,%r11d
addl %eax,%esi
@@ -1319,11 +1259,12 @@ L$loop:
jnz L$loop
movq 64(%rsp),%rsi
- movq (%rsi),%r13
- movq 8(%rsi),%r12
- movq 16(%rsi),%rbp
- movq 24(%rsi),%rbx
- leaq 32(%rsi),%rsp
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue:
.byte 0xf3,0xc3
@@ -1331,17 +1272,22 @@ L$epilogue:
.p2align 4
sha1_block_data_order_ssse3:
_ssse3_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
+ pushq %r13
+ pushq %r14
leaq -64(%rsp),%rsp
+ movq %rax,%r14
+ andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
movq %rdx,%r10
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r11
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1349,19 +1295,22 @@ _ssse3_shortcut:
movl 12(%r8),%edx
movl %ebx,%esi
movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
movdqu 48(%r9),%xmm3
.byte 102,15,56,0,198
- addq $64,%r9
.byte 102,15,56,0,206
.byte 102,15,56,0,214
-.byte 102,15,56,0,222
+ addq $64,%r9
paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
paddd %xmm9,%xmm1
paddd %xmm9,%xmm2
movdqa %xmm0,0(%rsp)
@@ -1373,904 +1322,882 @@ _ssse3_shortcut:
jmp L$oop_ssse3
.p2align 4
L$oop_ssse3:
- movdqa %xmm1,%xmm4
- addl 0(%rsp),%ebp
- xorl %edx,%ecx
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
movdqa %xmm3,%xmm8
-.byte 102,15,58,15,224,8
+ paddd %xmm3,%xmm9
movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
roll $5,%eax
- paddd %xmm3,%xmm9
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl %esi,%ebp
psrldq $4,%xmm8
- xorl %edx,%esi
- addl %eax,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
pxor %xmm0,%xmm4
- rorl $2,%ebx
- addl %esi,%ebp
+ addl %eax,%ebp
+ rorl $7,%eax
pxor %xmm2,%xmm8
- addl 4(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 4(%rsp),%edx
pxor %xmm8,%xmm4
- andl %ebx,%edi
- xorl %ecx,%ebx
+ xorl %ebx,%eax
+ roll $5,%ebp
movdqa %xmm9,48(%rsp)
- xorl %ecx,%edi
- addl %ebp,%edx
- movdqa %xmm4,%xmm10
- movdqa %xmm4,%xmm8
- rorl $7,%eax
addl %edi,%edx
- addl 8(%rsp),%ecx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
pslldq $12,%xmm10
paddd %xmm4,%xmm4
movl %edx,%edi
- roll $5,%edx
- andl %eax,%esi
- xorl %ebx,%eax
+ addl 8(%rsp),%ecx
psrld $31,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
- movdqa %xmm10,%xmm9
- rorl $7,%ebp
+ xorl %eax,%ebp
+ roll $5,%edx
addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
por %xmm8,%xmm4
- addl 12(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 12(%rsp),%ebx
pslld $2,%xmm9
pxor %xmm10,%xmm4
- andl %ebp,%edi
- xorl %eax,%ebp
- movdqa 0(%r11),%xmm10
- xorl %eax,%edi
- addl %ecx,%ebx
- pxor %xmm9,%xmm4
- rorl $7,%edx
+ xorl %ebp,%edx
+ movdqa -64(%r11),%xmm10
+ roll $5,%ecx
addl %edi,%ebx
- movdqa %xmm2,%xmm5
- addl 16(%rsp),%eax
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
movdqa %xmm4,%xmm9
-.byte 102,15,58,15,233,8
+ paddd %xmm4,%xmm10
movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
roll $5,%ebx
- paddd %xmm4,%xmm10
- andl %edx,%esi
- xorl %ebp,%edx
+ addl %esi,%eax
psrldq $4,%xmm9
- xorl %ebp,%esi
- addl %ebx,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
pxor %xmm1,%xmm5
- rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
+ rorl $7,%ebx
pxor %xmm3,%xmm9
- addl 20(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 20(%rsp),%ebp
pxor %xmm9,%xmm5
- andl %ecx,%edi
- xorl %edx,%ecx
+ xorl %ecx,%ebx
+ roll $5,%eax
movdqa %xmm10,0(%rsp)
- xorl %edx,%edi
- addl %eax,%ebp
- movdqa %xmm5,%xmm8
- movdqa %xmm5,%xmm9
- rorl $7,%ebx
addl %edi,%ebp
- addl 24(%rsp),%edx
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
pslldq $12,%xmm8
paddd %xmm5,%xmm5
movl %ebp,%edi
- roll $5,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl 24(%rsp),%edx
psrld $31,%xmm9
- xorl %ecx,%esi
- addl %ebp,%edx
- movdqa %xmm8,%xmm10
- rorl $7,%eax
+ xorl %ebx,%eax
+ roll $5,%ebp
addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
por %xmm9,%xmm5
- addl 28(%rsp),%ecx
- xorl %ebx,%eax
+ xorl %ebx,%edi
movl %edx,%esi
- roll $5,%edx
+ addl 28(%rsp),%ecx
pslld $2,%xmm10
pxor %xmm8,%xmm5
- andl %eax,%edi
- xorl %ebx,%eax
- movdqa 16(%r11),%xmm8
- xorl %ebx,%edi
- addl %edx,%ecx
- pxor %xmm10,%xmm5
- rorl $7,%ebp
+ xorl %eax,%ebp
+ movdqa -32(%r11),%xmm8
+ roll $5,%edx
addl %edi,%ecx
- movdqa %xmm3,%xmm6
- addl 32(%rsp),%ebx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
movdqa %xmm5,%xmm10
-.byte 102,15,58,15,242,8
+ paddd %xmm5,%xmm8
movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
roll $5,%ecx
- paddd %xmm5,%xmm8
- andl %ebp,%esi
- xorl %eax,%ebp
+ addl %esi,%ebx
psrldq $4,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
pxor %xmm2,%xmm6
- rorl $7,%edx
- addl %esi,%ebx
+ addl %ecx,%ebx
+ rorl $7,%ecx
pxor %xmm4,%xmm10
- addl 36(%rsp),%eax
- xorl %ebp,%edx
+ xorl %ebp,%edi
movl %ebx,%esi
- roll $5,%ebx
+ addl 36(%rsp),%eax
pxor %xmm10,%xmm6
- andl %edx,%edi
- xorl %ebp,%edx
+ xorl %edx,%ecx
+ roll $5,%ebx
movdqa %xmm8,16(%rsp)
- xorl %ebp,%edi
- addl %ebx,%eax
- movdqa %xmm6,%xmm9
- movdqa %xmm6,%xmm10
- rorl $7,%ecx
addl %edi,%eax
- addl 40(%rsp),%ebp
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
pslldq $12,%xmm9
paddd %xmm6,%xmm6
movl %eax,%edi
- roll $5,%eax
- andl %ecx,%esi
- xorl %edx,%ecx
+ addl 40(%rsp),%ebp
psrld $31,%xmm10
- xorl %edx,%esi
- addl %eax,%ebp
- movdqa %xmm9,%xmm8
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ roll $5,%eax
addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
por %xmm10,%xmm6
- addl 44(%rsp),%edx
- xorl %ecx,%ebx
+ xorl %ecx,%edi
movl %ebp,%esi
- roll $5,%ebp
+ addl 44(%rsp),%edx
pslld $2,%xmm8
pxor %xmm9,%xmm6
- andl %ebx,%edi
- xorl %ecx,%ebx
- movdqa 16(%r11),%xmm9
- xorl %ecx,%edi
- addl %ebp,%edx
- pxor %xmm8,%xmm6
- rorl $7,%eax
+ xorl %ebx,%eax
+ movdqa -32(%r11),%xmm9
+ roll $5,%ebp
addl %edi,%edx
- movdqa %xmm4,%xmm7
- addl 48(%rsp),%ecx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
movdqa %xmm6,%xmm8
-.byte 102,15,58,15,251,8
+ paddd %xmm6,%xmm9
movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
roll $5,%edx
- paddd %xmm6,%xmm9
- andl %eax,%esi
- xorl %ebx,%eax
+ addl %esi,%ecx
psrldq $4,%xmm8
- xorl %ebx,%esi
- addl %edx,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
pxor %xmm3,%xmm7
- rorl $7,%ebp
- addl %esi,%ecx
+ addl %edx,%ecx
+ rorl $7,%edx
pxor %xmm5,%xmm8
- addl 52(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%edi
movl %ecx,%esi
- roll $5,%ecx
+ addl 52(%rsp),%ebx
pxor %xmm8,%xmm7
- andl %ebp,%edi
- xorl %eax,%ebp
+ xorl %ebp,%edx
+ roll $5,%ecx
movdqa %xmm9,32(%rsp)
- xorl %eax,%edi
- addl %ecx,%ebx
- movdqa %xmm7,%xmm10
- movdqa %xmm7,%xmm8
- rorl $7,%edx
addl %edi,%ebx
- addl 56(%rsp),%eax
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
pslldq $12,%xmm10
paddd %xmm7,%xmm7
movl %ebx,%edi
- roll $5,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
+ addl 56(%rsp),%eax
psrld $31,%xmm8
- xorl %ebp,%esi
- addl %ebx,%eax
- movdqa %xmm10,%xmm9
- rorl $7,%ecx
+ xorl %edx,%ecx
+ roll $5,%ebx
addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
por %xmm8,%xmm7
- addl 60(%rsp),%ebp
- xorl %edx,%ecx
+ xorl %edx,%edi
movl %eax,%esi
- roll $5,%eax
+ addl 60(%rsp),%ebp
pslld $2,%xmm9
pxor %xmm10,%xmm7
- andl %ecx,%edi
- xorl %edx,%ecx
- movdqa 16(%r11),%xmm10
- xorl %edx,%edi
- addl %eax,%ebp
- pxor %xmm9,%xmm7
- rorl $7,%ebx
+ xorl %ecx,%ebx
+ movdqa -32(%r11),%xmm10
+ roll $5,%eax
addl %edi,%ebp
- movdqa %xmm7,%xmm9
- addl 0(%rsp),%edx
- pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,206,8
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
roll $5,%ebp
pxor %xmm1,%xmm0
- andl %ebx,%esi
- xorl %ecx,%ebx
+ addl %esi,%edx
+ andl %eax,%edi
movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
paddd %xmm7,%xmm10
- xorl %ecx,%esi
addl %ebp,%edx
pxor %xmm9,%xmm0
- rorl $7,%eax
- addl %esi,%edx
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
addl 4(%rsp),%ecx
- xorl %ebx,%eax
movdqa %xmm0,%xmm9
- movdqa %xmm10,48(%rsp)
- movl %edx,%esi
+ xorl %eax,%ebp
roll $5,%edx
- andl %eax,%edi
- xorl %ebx,%eax
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
pslld $2,%xmm0
- xorl %ebx,%edi
addl %edx,%ecx
+ rorl $7,%edx
psrld $30,%xmm9
- rorl $7,%ebp
- addl %edi,%ecx
- addl 8(%rsp),%ebx
- xorl %eax,%ebp
+ xorl %eax,%esi
movl %ecx,%edi
- roll $5,%ecx
+ addl 8(%rsp),%ebx
por %xmm9,%xmm0
- andl %ebp,%esi
- xorl %eax,%ebp
- movdqa %xmm0,%xmm10
- xorl %eax,%esi
- addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- addl 12(%rsp),%eax
xorl %ebp,%edx
- movl %ebx,%esi
- roll $5,%ebx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
andl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
xorl %ebp,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ movl %ebx,%esi
+ roll $5,%ebx
addl %edi,%eax
- addl 16(%rsp),%ebp
- pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,215,8
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
movl %eax,%edi
roll $5,%eax
pxor %xmm2,%xmm1
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm8,%xmm9
- paddd %xmm0,%xmm8
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
pxor %xmm10,%xmm1
addl 20(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm8,0(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm1
+ addl %ebp,%edx
addl 24(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm1
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm10
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm10,%xmm1
+ addl %edx,%ecx
addl 28(%rsp),%ebx
- xorl %eax,%edi
- movdqa %xmm1,%xmm8
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 32(%rsp),%eax
- pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,192,8
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
movl %ebx,%edi
roll $5,%ebx
pxor %xmm3,%xmm2
- xorl %edx,%esi
- addl %ebx,%eax
- movdqa 32(%r11),%xmm10
- paddd %xmm1,%xmm9
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r11),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
pxor %xmm8,%xmm2
addl 36(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
movdqa %xmm9,16(%rsp)
- xorl %ecx,%edi
- addl %eax,%ebp
rorl $7,%ebx
- addl %edi,%ebp
- pslld $2,%xmm2
+ addl %eax,%ebp
addl 40(%rsp),%edx
- xorl %ecx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm2
+ xorl %ebx,%esi
movl %ebp,%edi
+ psrld $30,%xmm8
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
por %xmm8,%xmm2
+ addl %ebp,%edx
addl 44(%rsp),%ecx
- xorl %ebx,%edi
- movdqa %xmm2,%xmm9
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 48(%rsp),%ebx
- pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,201,8
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
movl %ecx,%edi
roll $5,%ecx
pxor %xmm4,%xmm3
- xorl %ebp,%esi
- addl %ecx,%ebx
+ addl %esi,%ebx
+ xorl %ebp,%edi
movdqa %xmm10,%xmm8
- paddd %xmm2,%xmm10
rorl $7,%edx
- addl %esi,%ebx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
pxor %xmm9,%xmm3
addl 52(%rsp),%eax
- xorl %ebp,%edi
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
movdqa %xmm10,32(%rsp)
- xorl %edx,%edi
- addl %ebx,%eax
rorl $7,%ecx
- addl %edi,%eax
- pslld $2,%xmm3
+ addl %ebx,%eax
addl 56(%rsp),%ebp
- xorl %edx,%esi
- psrld $30,%xmm9
+ pslld $2,%xmm3
+ xorl %ecx,%esi
movl %eax,%edi
+ psrld $30,%xmm9
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
por %xmm9,%xmm3
+ addl %eax,%ebp
addl 60(%rsp),%edx
- xorl %ecx,%edi
- movdqa %xmm3,%xmm10
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 0(%rsp),%ecx
- pxor %xmm0,%xmm4
-.byte 102,68,15,58,15,210,8
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
movl %edx,%edi
roll $5,%edx
pxor %xmm5,%xmm4
- xorl %eax,%esi
- addl %edx,%ecx
+ addl %esi,%ecx
+ xorl %eax,%edi
movdqa %xmm8,%xmm9
- paddd %xmm3,%xmm8
rorl $7,%ebp
- addl %esi,%ecx
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
pxor %xmm10,%xmm4
addl 4(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
movdqa %xmm8,48(%rsp)
- xorl %ebp,%edi
- addl %ecx,%ebx
rorl $7,%edx
- addl %edi,%ebx
- pslld $2,%xmm4
+ addl %ecx,%ebx
addl 8(%rsp),%eax
- xorl %ebp,%esi
- psrld $30,%xmm10
+ pslld $2,%xmm4
+ xorl %edx,%esi
movl %ebx,%edi
+ psrld $30,%xmm10
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
por %xmm10,%xmm4
+ addl %ebx,%eax
addl 12(%rsp),%ebp
- xorl %edx,%edi
- movdqa %xmm4,%xmm8
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 16(%rsp),%edx
- pxor %xmm1,%xmm5
-.byte 102,68,15,58,15,195,8
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
movl %ebp,%edi
roll $5,%ebp
pxor %xmm6,%xmm5
- xorl %ebx,%esi
- addl %ebp,%edx
+ addl %esi,%edx
+ xorl %ebx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm4,%xmm9
rorl $7,%eax
- addl %esi,%edx
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
pxor %xmm8,%xmm5
addl 20(%rsp),%ecx
- xorl %ebx,%edi
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
movdqa %xmm9,0(%rsp)
- xorl %eax,%edi
- addl %edx,%ecx
rorl $7,%ebp
- addl %edi,%ecx
- pslld $2,%xmm5
+ addl %edx,%ecx
addl 24(%rsp),%ebx
- xorl %eax,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm5
+ xorl %ebp,%esi
movl %ecx,%edi
+ psrld $30,%xmm8
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
por %xmm8,%xmm5
+ addl %ecx,%ebx
addl 28(%rsp),%eax
- xorl %ebp,%edi
- movdqa %xmm5,%xmm9
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
movl %ebx,%esi
- roll $5,%ebx
xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
+ roll $5,%ebx
addl %edi,%eax
- movl %ecx,%edi
- pxor %xmm2,%xmm6
-.byte 102,68,15,58,15,204,8
+ xorl %ecx,%esi
xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
addl 32(%rsp),%ebp
- andl %edx,%edi
- pxor %xmm7,%xmm6
andl %ecx,%esi
+ xorl %edx,%ecx
rorl $7,%ebx
- movdqa %xmm10,%xmm8
- paddd %xmm5,%xmm10
- addl %edi,%ebp
+ punpcklqdq %xmm5,%xmm9
movl %eax,%edi
- pxor %xmm9,%xmm6
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
roll $5,%eax
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movdqa %xmm6,%xmm9
- movdqa %xmm10,16(%rsp)
- movl %ebx,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
addl 36(%rsp),%edx
- andl %ecx,%esi
- pslld $2,%xmm6
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- psrld $30,%xmm9
- addl %esi,%edx
+ movdqa %xmm6,%xmm9
movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- por %xmm9,%xmm6
- movl %eax,%edi
+ xorl %eax,%esi
+ pslld $2,%xmm6
xorl %ebx,%eax
- movdqa %xmm6,%xmm10
+ addl %ebp,%edx
+ psrld $30,%xmm9
addl 40(%rsp),%ecx
- andl %ebx,%edi
andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
rorl $7,%ebp
- addl %edi,%ecx
movl %edx,%edi
+ xorl %eax,%esi
roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movl %ebp,%esi
+ xorl %ebp,%edi
xorl %eax,%ebp
+ addl %edx,%ecx
addl 44(%rsp),%ebx
- andl %eax,%esi
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- addl %esi,%ebx
movl %ecx,%esi
+ xorl %ebp,%edi
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
+ xorl %edx,%esi
+ xorl %ebp,%edx
addl %ecx,%ebx
- movl %edx,%edi
pxor %xmm3,%xmm7
-.byte 102,68,15,58,15,213,8
- xorl %ebp,%edx
addl 48(%rsp),%eax
- andl %ebp,%edi
- pxor %xmm0,%xmm7
andl %edx,%esi
+ xorl %ebp,%edx
rorl $7,%ecx
- movdqa 48(%r11),%xmm9
- paddd %xmm6,%xmm8
- addl %edi,%eax
+ punpcklqdq %xmm6,%xmm10
movl %ebx,%edi
- pxor %xmm10,%xmm7
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movdqa %xmm7,%xmm10
- movdqa %xmm8,32(%rsp)
- movl %ecx,%esi
+ movdqa 32(%r11),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
addl 52(%rsp),%ebp
- andl %edx,%esi
- pslld $2,%xmm7
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- psrld $30,%xmm10
- addl %esi,%ebp
+ movdqa %xmm7,%xmm10
movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- por %xmm10,%xmm7
- movl %ebx,%edi
+ xorl %ebx,%esi
+ pslld $2,%xmm7
xorl %ecx,%ebx
- movdqa %xmm7,%xmm8
+ addl %eax,%ebp
+ psrld $30,%xmm10
addl 56(%rsp),%edx
- andl %ecx,%edi
andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
rorl $7,%eax
- addl %edi,%edx
movl %ebp,%edi
+ xorl %ebx,%esi
roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movl %eax,%esi
+ xorl %eax,%edi
xorl %ebx,%eax
+ addl %ebp,%edx
addl 60(%rsp),%ecx
- andl %ebx,%esi
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- addl %esi,%ecx
movl %edx,%esi
+ xorl %eax,%edi
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
+ xorl %ebp,%esi
+ xorl %eax,%ebp
addl %edx,%ecx
- movl %ebp,%edi
pxor %xmm4,%xmm0
-.byte 102,68,15,58,15,198,8
- xorl %eax,%ebp
addl 0(%rsp),%ebx
- andl %eax,%edi
- pxor %xmm1,%xmm0
andl %ebp,%esi
+ xorl %eax,%ebp
rorl $7,%edx
- movdqa %xmm9,%xmm10
- paddd %xmm7,%xmm9
- addl %edi,%ebx
+ punpcklqdq %xmm7,%xmm8
movl %ecx,%edi
- pxor %xmm8,%xmm0
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
roll $5,%ecx
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movdqa %xmm0,%xmm8
- movdqa %xmm9,48(%rsp)
- movl %edx,%esi
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
addl 4(%rsp),%eax
- andl %ebp,%esi
- pslld $2,%xmm0
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- psrld $30,%xmm8
- addl %esi,%eax
+ movdqa %xmm0,%xmm8
movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- por %xmm8,%xmm0
- movl %ecx,%edi
+ xorl %ecx,%esi
+ pslld $2,%xmm0
xorl %edx,%ecx
- movdqa %xmm0,%xmm9
+ addl %ebx,%eax
+ psrld $30,%xmm8
addl 8(%rsp),%ebp
- andl %edx,%edi
andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
rorl $7,%ebx
- addl %edi,%ebp
movl %eax,%edi
+ xorl %ecx,%esi
roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
addl %esi,%ebp
- xorl %edx,%ecx
- addl %eax,%ebp
- movl %ebx,%esi
+ xorl %ebx,%edi
xorl %ecx,%ebx
+ addl %eax,%ebp
addl 12(%rsp),%edx
- andl %ecx,%esi
andl %ebx,%edi
+ xorl %ecx,%ebx
rorl $7,%eax
- addl %esi,%edx
movl %ebp,%esi
+ xorl %ebx,%edi
roll $5,%ebp
addl %edi,%edx
- xorl %ecx,%ebx
+ xorl %eax,%esi
+ xorl %ebx,%eax
addl %ebp,%edx
- movl %eax,%edi
pxor %xmm5,%xmm1
-.byte 102,68,15,58,15,207,8
- xorl %ebx,%eax
addl 16(%rsp),%ecx
- andl %ebx,%edi
- pxor %xmm2,%xmm1
andl %eax,%esi
+ xorl %ebx,%eax
rorl $7,%ebp
- movdqa %xmm10,%xmm8
- paddd %xmm0,%xmm10
- addl %edi,%ecx
+ punpcklqdq %xmm0,%xmm9
movl %edx,%edi
- pxor %xmm9,%xmm1
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
roll $5,%edx
addl %esi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- movdqa %xmm1,%xmm9
- movdqa %xmm10,0(%rsp)
- movl %ebp,%esi
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
addl 20(%rsp),%ebx
- andl %eax,%esi
- pslld $2,%xmm1
andl %ebp,%edi
+ xorl %eax,%ebp
rorl $7,%edx
- psrld $30,%xmm9
- addl %esi,%ebx
+ movdqa %xmm1,%xmm9
movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
roll $5,%ecx
addl %edi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- por %xmm9,%xmm1
- movl %edx,%edi
+ xorl %edx,%esi
+ pslld $2,%xmm1
xorl %ebp,%edx
- movdqa %xmm1,%xmm10
+ addl %ecx,%ebx
+ psrld $30,%xmm9
addl 24(%rsp),%eax
- andl %ebp,%edi
andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
rorl $7,%ecx
- addl %edi,%eax
movl %ebx,%edi
+ xorl %edx,%esi
roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
addl %esi,%eax
- xorl %ebp,%edx
- addl %ebx,%eax
- movl %ecx,%esi
+ xorl %ecx,%edi
xorl %edx,%ecx
+ addl %ebx,%eax
addl 28(%rsp),%ebp
- andl %edx,%esi
andl %ecx,%edi
+ xorl %edx,%ecx
rorl $7,%ebx
- addl %esi,%ebp
movl %eax,%esi
+ xorl %ecx,%edi
roll $5,%eax
addl %edi,%ebp
- xorl %edx,%ecx
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
addl %eax,%ebp
- movl %ebx,%edi
pxor %xmm6,%xmm2
-.byte 102,68,15,58,15,208,8
- xorl %ecx,%ebx
addl 32(%rsp),%edx
- andl %ecx,%edi
- pxor %xmm3,%xmm2
andl %ebx,%esi
+ xorl %ecx,%ebx
rorl $7,%eax
- movdqa %xmm8,%xmm9
- paddd %xmm1,%xmm8
- addl %edi,%edx
+ punpcklqdq %xmm1,%xmm10
movl %ebp,%edi
- pxor %xmm10,%xmm2
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
roll $5,%ebp
addl %esi,%edx
- xorl %ecx,%ebx
- addl %ebp,%edx
- movdqa %xmm2,%xmm10
- movdqa %xmm8,16(%rsp)
- movl %eax,%esi
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
addl 36(%rsp),%ecx
- andl %ebx,%esi
- pslld $2,%xmm2
andl %eax,%edi
+ xorl %ebx,%eax
rorl $7,%ebp
- psrld $30,%xmm10
- addl %esi,%ecx
+ movdqa %xmm2,%xmm10
movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
roll $5,%edx
addl %edi,%ecx
- xorl %ebx,%eax
- addl %edx,%ecx
- por %xmm10,%xmm2
- movl %ebp,%edi
+ xorl %ebp,%esi
+ pslld $2,%xmm2
xorl %eax,%ebp
- movdqa %xmm2,%xmm8
+ addl %edx,%ecx
+ psrld $30,%xmm10
addl 40(%rsp),%ebx
- andl %eax,%edi
andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
rorl $7,%edx
- addl %edi,%ebx
movl %ecx,%edi
+ xorl %ebp,%esi
roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
addl %esi,%ebx
- xorl %eax,%ebp
- addl %ecx,%ebx
- movl %edx,%esi
+ xorl %edx,%edi
xorl %ebp,%edx
+ addl %ecx,%ebx
addl 44(%rsp),%eax
- andl %ebp,%esi
andl %edx,%edi
+ xorl %ebp,%edx
rorl $7,%ecx
- addl %esi,%eax
movl %ebx,%esi
+ xorl %edx,%edi
roll $5,%ebx
addl %edi,%eax
- xorl %ebp,%edx
+ xorl %edx,%esi
addl %ebx,%eax
- addl 48(%rsp),%ebp
pxor %xmm7,%xmm3
-.byte 102,68,15,58,15,193,8
- xorl %edx,%esi
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
movl %eax,%edi
roll $5,%eax
pxor %xmm4,%xmm3
- xorl %ecx,%esi
- addl %eax,%ebp
+ addl %esi,%ebp
+ xorl %ecx,%edi
movdqa %xmm9,%xmm10
- paddd %xmm2,%xmm9
rorl $7,%ebx
- addl %esi,%ebp
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
pxor %xmm8,%xmm3
addl 52(%rsp),%edx
- xorl %ecx,%edi
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
movdqa %xmm9,32(%rsp)
- xorl %ebx,%edi
- addl %ebp,%edx
rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm3
+ addl %ebp,%edx
addl 56(%rsp),%ecx
- xorl %ebx,%esi
- psrld $30,%xmm8
+ pslld $2,%xmm3
+ xorl %eax,%esi
movl %edx,%edi
+ psrld $30,%xmm8
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
por %xmm8,%xmm3
+ addl %edx,%ecx
addl 60(%rsp),%ebx
- xorl %eax,%edi
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 0(%rsp),%eax
- paddd %xmm3,%xmm10
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
movdqa %xmm10,48(%rsp)
- addl %ebx,%eax
rorl $7,%ecx
- addl %esi,%eax
+ addl %ebx,%eax
addl 4(%rsp),%ebp
- xorl %edx,%edi
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 8(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 12(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
cmpq %r10,%r9
je L$done_ssse3
movdqa 64(%r11),%xmm6
- movdqa 0(%r11),%xmm9
+ movdqa -64(%r11),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2278,113 +2205,112 @@ L$oop_ssse3:
.byte 102,15,56,0,198
addq $64,%r9
addl 16(%rsp),%ebx
- xorl %eax,%esi
-.byte 102,15,56,0,206
+ xorl %ebp,%esi
movl %ecx,%edi
+.byte 102,15,56,0,206
roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
paddd %xmm9,%xmm0
- xorl %ebp,%esi
addl %ecx,%ebx
- rorl $7,%edx
- addl %esi,%ebx
- movdqa %xmm0,0(%rsp)
addl 20(%rsp),%eax
- xorl %ebp,%edi
- psubd %xmm9,%xmm0
+ xorl %edx,%edi
movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
-.byte 102,15,56,0,214
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
+.byte 102,15,56,0,214
roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
paddd %xmm9,%xmm1
- xorl %eax,%esi
addl %edx,%ecx
- rorl $7,%ebp
- addl %esi,%ecx
- movdqa %xmm1,16(%rsp)
addl 36(%rsp),%ebx
- xorl %eax,%edi
- psubd %xmm9,%xmm1
+ xorl %ebp,%edi
movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
-.byte 102,15,56,0,222
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
+.byte 102,15,56,0,222
roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
paddd %xmm9,%xmm2
- xorl %ebx,%esi
addl %ebp,%edx
- rorl $7,%eax
- addl %esi,%edx
- movdqa %xmm2,32(%rsp)
addl 52(%rsp),%ecx
- xorl %ebx,%edi
- psubd %xmm9,%xmm2
+ xorl %eax,%edi
movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2394,108 +2320,110 @@ L$oop_ssse3:
movl %esi,4(%r8)
movl %esi,%ebx
movl %ecx,8(%r8)
+ movl %ecx,%edi
movl %edx,12(%r8)
+ xorl %edx,%edi
movl %ebp,16(%r8)
+ andl %edi,%esi
jmp L$oop_ssse3
.p2align 4
L$done_ssse3:
addl 16(%rsp),%ebx
- xorl %eax,%esi
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 20(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
- addl 24(%rsp),%ebp
xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
movl %eax,%edi
roll $5,%eax
- xorl %ecx,%esi
- addl %eax,%ebp
- rorl $7,%ebx
addl %esi,%ebp
- addl 28(%rsp),%edx
xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
movl %ebp,%esi
roll $5,%ebp
- xorl %ebx,%edi
- addl %ebp,%edx
- rorl $7,%eax
addl %edi,%edx
- addl 32(%rsp),%ecx
xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
movl %edx,%edi
roll $5,%edx
- xorl %eax,%esi
- addl %edx,%ecx
- rorl $7,%ebp
addl %esi,%ecx
- addl 36(%rsp),%ebx
xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
movl %ecx,%esi
roll $5,%ecx
- xorl %ebp,%edi
- addl %ecx,%ebx
- rorl $7,%edx
addl %edi,%ebx
- addl 40(%rsp),%eax
xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
movl %ebx,%edi
roll $5,%ebx
- xorl %edx,%esi
- addl %ebx,%eax
- rorl $7,%ecx
addl %esi,%eax
- addl 44(%rsp),%ebp
xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
movl %eax,%esi
roll $5,%eax
- xorl %ecx,%edi
- addl %eax,%ebp
- rorl $7,%ebx
addl %edi,%ebp
- addl 48(%rsp),%edx
xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
movl %ebp,%edi
roll $5,%ebp
- xorl %ebx,%esi
- addl %ebp,%edx
- rorl $7,%eax
addl %esi,%edx
- addl 52(%rsp),%ecx
xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
movl %edx,%esi
roll $5,%edx
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%ebp
addl %edi,%ecx
- addl 56(%rsp),%ebx
xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
movl %ecx,%edi
roll $5,%ecx
- xorl %ebp,%esi
- addl %ecx,%ebx
- rorl $7,%edx
addl %esi,%ebx
- addl 60(%rsp),%eax
xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
movl %ebx,%esi
roll $5,%ebx
- xorl %edx,%edi
- addl %ebx,%eax
- rorl $7,%ecx
addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
addl 0(%r8),%eax
addl 4(%r8),%esi
addl 8(%r8),%ecx
@@ -2506,21 +2434,28 @@ L$done_ssse3:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq 64(%rsp),%rsi
- movq 0(%rsi),%r12
- movq 8(%rsi),%rbp
- movq 16(%rsi),%rbx
- leaq 24(%rsi),%rsp
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue_ssse3:
.byte 0xf3,0xc3
.p2align 6
K_XX_XX:
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s
index 36e0a06f18..e35d28fee0 100644
--- a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s
+++ b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s
@@ -63,195 +63,391 @@ L000pic_point:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ jmp L002loop
.align 4,0x90
L002loop:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
- movl 12(%edi),%edx
bswap %eax
+ movl 12(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 16(%edi),%eax
movl 20(%edi),%ebx
movl 24(%edi),%ecx
- movl 28(%edi),%edx
bswap %eax
+ movl 28(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 32(%edi),%eax
movl 36(%edi),%ebx
movl 40(%edi),%ecx
- movl 44(%edi),%edx
bswap %eax
+ movl 44(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
movl 48(%edi),%eax
movl 52(%edi),%ebx
movl 56(%edi),%ecx
- movl 60(%edi),%edx
bswap %eax
+ movl 60(%edi),%edx
bswap %ebx
- bswap %ecx
- bswap %edx
pushl %eax
+ bswap %ecx
pushl %ebx
+ bswap %edx
pushl %ecx
pushl %edx
addl $64,%edi
- subl $32,%esp
- movl %edi,100(%esp)
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edi
- movl %ebx,4(%esp)
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
movl 16(%esi),%edx
movl 20(%esi),%ebx
movl 24(%esi),%ecx
movl 28(%esi),%edi
- movl %ebx,20(%esp)
- movl %ecx,24(%esp)
- movl %edi,28(%esp)
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
.align 4,0x90
L00300_15:
- movl 92(%esp),%ebx
movl %edx,%ecx
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $5,%ecx
- xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
- movl %eax,(%esp)
- movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
- movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %edi,%eax
addl $4,%ebp
addl %ebx,%eax
- addl %esi,%edx
- addl %esi,%eax
cmpl $3248222580,%esi
jne L00300_15
- movl 152(%esp),%ebx
+ movl 156(%esp),%ecx
+ jmp L00416_63
.align 4,0x90
L00416_63:
- movl %ebx,%esi
- movl 100(%esp),%ecx
- rorl $11,%esi
- movl %ecx,%edi
- xorl %ebx,%esi
- rorl $7,%esi
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
shrl $3,%ebx
- rorl $2,%edi
- xorl %esi,%ebx
- xorl %ecx,%edi
- rorl $17,%edi
- shrl $10,%ecx
- addl 156(%esp),%ebx
- xorl %ecx,%edi
- addl 120(%esp),%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
movl %edx,%ecx
- addl %edi,%ebx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
rorl $14,%ecx
- movl 20(%esp),%esi
- xorl %edx,%ecx
- rorl $5,%ecx
- movl %ebx,92(%esp)
+ addl %edi,%ebx
+ movl 28(%esp),%edi
xorl %edx,%ecx
- rorl $6,%ecx
- movl 24(%esp),%edi
- addl %ecx,%ebx
xorl %edi,%esi
- movl %edx,16(%esp)
- movl %eax,%ecx
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
andl %edx,%esi
- movl 12(%esp),%edx
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
xorl %edi,%esi
- movl %eax,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
addl %esi,%ebx
rorl $9,%ecx
- addl 28(%esp),%ebx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
rorl $11,%ecx
- movl 4(%esp),%esi
+ movl (%ebp),%esi
xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
addl %ebx,%edx
- movl 8(%esp),%edi
+ andl 4(%esp),%eax
addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne L00416_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+L005loop_shrd:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 4,0x90
+L00600_15_shrd:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne L00600_15_shrd
+ movl 156(%esp),%ecx
+ jmp L00716_63_shrd
+.align 4,0x90
+L00716_63_shrd:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ shrdl $11,%ecx,%ecx
+ movl %esi,%edi
+ shrdl $2,%esi,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ shrdl $17,%esi,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ shrdl $14,%ecx,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ shrdl $5,%ecx,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ shrdl $6,%edx,%edx
movl %eax,%ecx
- subl $4,%esp
- orl %esi,%eax
- andl %esi,%ecx
- andl %edi,%eax
+ addl %esi,%ebx
+ shrdl $9,%ecx,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ shrdl $11,%ecx,%ecx
movl (%ebp),%esi
- orl %ecx,%eax
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
addl $4,%ebp
addl %ebx,%eax
- movl 152(%esp),%ebx
- addl %esi,%edx
- addl %esi,%eax
cmpl $3329325298,%esi
- jne L00416_63
- movl 352(%esp),%esi
- movl 4(%esp),%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edi
+ jne L00716_63_shrd
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
addl (%esi),%eax
addl 4(%esi),%ebx
- addl 8(%esi),%ecx
- addl 12(%esi),%edi
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
movl %eax,(%esi)
movl %ebx,4(%esi)
- movl %ecx,8(%esi)
- movl %edi,12(%esi)
- movl 20(%esp),%eax
- movl 24(%esp),%ebx
- movl 28(%esp),%ecx
- movl 356(%esp),%edi
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
addl 16(%esi),%edx
addl 20(%esi),%eax
addl 24(%esi),%ebx
@@ -260,10 +456,10 @@ L00416_63:
movl %eax,20(%esi)
movl %ebx,24(%esi)
movl %ecx,28(%esi)
- addl $352,%esp
+ leal 356(%esp),%esp
subl $256,%ebp
cmpl 8(%esp),%edi
- jb L002loop
+ jb L005loop_shrd
movl 12(%esp),%esp
popl %edi
popl %esi
@@ -272,25 +468,2918 @@ L00416_63:
ret
.align 6,0x90
L001K256:
-.long 1116352408,1899447441,3049323471,3921009573
-.long 961987163,1508970993,2453635748,2870763221
-.long 3624381080,310598401,607225278,1426881987
-.long 1925078388,2162078206,2614888103,3248222580
-.long 3835390401,4022224774,264347078,604807628
-.long 770255983,1249150122,1555081692,1996064986
-.long 2554220882,2821834349,2952996808,3210313671
-.long 3336571891,3584528711,113926993,338241895
-.long 666307205,773529912,1294757372,1396182291
-.long 1695183700,1986661051,2177026350,2456956037
-.long 2730485921,2820302411,3259730800,3345764771
-.long 3516065817,3600352804,4094571909,275423344
-.long 430227734,506948616,659060556,883997877
-.long 958139571,1322822218,1537002063,1747873779
-.long 1955562222,2024104815,2227730452,2361852424
-.long 2428436474,2756734187,3204031479,3329325298
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.align 4,0x90
+L008unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp L009grand_loop
+.align 4,0x90
+L009grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb L009grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
index 0014a8116b..53a7945f1c 100644
--- a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
+++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
@@ -593,6 +593,8 @@ L001K512:
.long 4234509866,1501505948
.long 987167468,1607167915
.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
index 23da5e6e39..c48240f457 100644
--- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
+++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
@@ -39,10 +39,17 @@
#
.text
+
.globl _sha256_block_data_order
.p2align 4
_sha256_block_data_order:
+ leaq __gnutls_x86_cpuid_s(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $512,%r10d
+ jnz L$ssse3_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -60,8 +67,6 @@ _sha256_block_data_order:
movq %r11,64+24(%rsp)
L$prologue:
- leaq K256(%rip),%rbp
-
movl 0(%rdi),%eax
movl 4(%rdi),%ebx
movl 8(%rdi),%ecx
@@ -74,1694 +79,1632 @@ L$prologue:
.p2align 4
L$loop:
- xorq %rdi,%rdi
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
movl 0(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 4(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 8(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 12(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 16(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 20(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 24(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 28(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
movl 32(%rsi),%r12d
movl %r8d,%r13d
movl %eax,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
movl 36(%rsi),%r12d
movl %edx,%r13d
movl %r11d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
movl 40(%rsi),%r12d
movl %ecx,%r13d
movl %r10d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
movl 44(%rsi),%r12d
movl %ebx,%r13d
movl %r9d,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
movl 48(%rsi),%r12d
movl %eax,%r13d
movl %r8d,%r14d
bswapl %r12d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
movl 52(%rsi),%r12d
movl %r11d,%r13d
movl %edx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
movl 56(%rsi),%r12d
movl %r10d,%r13d
movl %ecx,%r14d
bswapl %r12d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
movl 60(%rsi),%r12d
movl %r9d,%r13d
movl %ebx,%r14d
bswapl %r12d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
jmp L$rounds_16_xx
.p2align 4
L$rounds_16_xx:
movl 4(%rsp),%r13d
- movl 56(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 56(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 36(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
addl 0(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,0(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,0(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 8(%rsp),%r13d
- movl 60(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 60(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 40(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
addl 4(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,4(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,4(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 12(%rsp),%r13d
- movl 0(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 0(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 44(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
addl 8(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,8(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,8(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 16(%rsp),%r13d
- movl 4(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 4(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 48(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
addl 12(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,12(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,12(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 20(%rsp),%r13d
- movl 8(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 8(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 52(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
addl 16(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,16(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,16(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 24(%rsp),%r13d
- movl 12(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 12(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 56(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
addl 20(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,20(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,20(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 28(%rsp),%r13d
- movl 16(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 16(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 60(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
addl 24(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,24(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,24(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 32(%rsp),%r13d
- movl 20(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 20(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 0(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
addl 28(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,28(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,28(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
+ leaq 20(%rbp),%rbp
movl 36(%rsp),%r13d
- movl 24(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 24(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 4(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
addl 32(%rsp),%r12d
movl %r8d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %eax,%r14d
rorl $14,%r13d
movl %r9d,%r15d
- movl %r12d,32(%rsp)
- rorl $9,%r14d
xorl %r8d,%r13d
+ rorl $9,%r14d
xorl %r10d,%r15d
- rorl $5,%r13d
- addl %r11d,%r12d
+ movl %r12d,32(%rsp)
xorl %eax,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r8d,%r15d
- movl %ebx,%r11d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
rorl $11,%r14d
xorl %r8d,%r13d
- xorl %r10d,%r15d
+ addl %r15d,%r12d
- xorl %ecx,%r11d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
xorl %eax,%r14d
- addl %r15d,%r12d
- movl %ebx,%r15d
+ xorl %ebx,%r15d
rorl $6,%r13d
- andl %eax,%r11d
- andl %ecx,%r15d
+ movl %ebx,%r11d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r11d
+ xorl %edi,%r11d
addl %r12d,%edx
addl %r12d,%r11d
- leaq 1(%rdi),%rdi
- addl %r14d,%r11d
+ leaq 4(%rbp),%rbp
movl 40(%rsp),%r13d
- movl 28(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 28(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 8(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
addl 36(%rsp),%r12d
movl %edx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r11d,%r14d
rorl $14,%r13d
- movl %r8d,%r15d
- movl %r12d,36(%rsp)
+ movl %r8d,%edi
- rorl $9,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ rorl $9,%r14d
+ xorl %r9d,%edi
- rorl $5,%r13d
- addl %r10d,%r12d
+ movl %r12d,36(%rsp)
xorl %r11d,%r14d
+ andl %edx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %edx,%r15d
- movl %eax,%r10d
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
rorl $11,%r14d
xorl %edx,%r13d
- xorl %r9d,%r15d
+ addl %edi,%r12d
- xorl %ebx,%r10d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
xorl %r11d,%r14d
- addl %r15d,%r12d
- movl %eax,%r15d
+ xorl %eax,%edi
rorl $6,%r13d
- andl %r11d,%r10d
- andl %ebx,%r15d
+ movl %eax,%r10d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r10d
+ xorl %r15d,%r10d
addl %r12d,%ecx
addl %r12d,%r10d
- leaq 1(%rdi),%rdi
- addl %r14d,%r10d
+ leaq 4(%rbp),%rbp
movl 44(%rsp),%r13d
- movl 32(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 32(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 12(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
addl 40(%rsp),%r12d
movl %ecx,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r10d,%r14d
rorl $14,%r13d
movl %edx,%r15d
- movl %r12d,40(%rsp)
- rorl $9,%r14d
xorl %ecx,%r13d
+ rorl $9,%r14d
xorl %r8d,%r15d
- rorl $5,%r13d
- addl %r9d,%r12d
+ movl %r12d,40(%rsp)
xorl %r10d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %ecx,%r15d
- movl %r11d,%r9d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
rorl $11,%r14d
xorl %ecx,%r13d
- xorl %r8d,%r15d
+ addl %r15d,%r12d
- xorl %eax,%r9d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
xorl %r10d,%r14d
- addl %r15d,%r12d
- movl %r11d,%r15d
+ xorl %r11d,%r15d
rorl $6,%r13d
- andl %r10d,%r9d
- andl %eax,%r15d
+ movl %r11d,%r9d
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r9d
+ xorl %edi,%r9d
addl %r12d,%ebx
addl %r12d,%r9d
- leaq 1(%rdi),%rdi
- addl %r14d,%r9d
+ leaq 4(%rbp),%rbp
movl 48(%rsp),%r13d
- movl 36(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 36(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 16(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
addl 44(%rsp),%r12d
movl %ebx,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %r9d,%r14d
rorl $14,%r13d
- movl %ecx,%r15d
- movl %r12d,44(%rsp)
+ movl %ecx,%edi
- rorl $9,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ rorl $9,%r14d
+ xorl %edx,%edi
- rorl $5,%r13d
- addl %r8d,%r12d
+ movl %r12d,44(%rsp)
xorl %r9d,%r14d
+ andl %ebx,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %ebx,%r15d
- movl %r10d,%r8d
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
rorl $11,%r14d
xorl %ebx,%r13d
- xorl %edx,%r15d
+ addl %edi,%r12d
- xorl %r11d,%r8d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
xorl %r9d,%r14d
- addl %r15d,%r12d
- movl %r10d,%r15d
+ xorl %r10d,%edi
rorl $6,%r13d
- andl %r9d,%r8d
- andl %r11d,%r15d
+ movl %r10d,%r8d
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%r8d
+ xorl %r15d,%r8d
addl %r12d,%eax
addl %r12d,%r8d
- leaq 1(%rdi),%rdi
- addl %r14d,%r8d
+ leaq 20(%rbp),%rbp
movl 52(%rsp),%r13d
- movl 40(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 40(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 20(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
addl 48(%rsp),%r12d
movl %eax,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %r8d,%r14d
rorl $14,%r13d
movl %ebx,%r15d
- movl %r12d,48(%rsp)
- rorl $9,%r14d
xorl %eax,%r13d
+ rorl $9,%r14d
xorl %ecx,%r15d
- rorl $5,%r13d
- addl %edx,%r12d
+ movl %r12d,48(%rsp)
xorl %r8d,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %eax,%r15d
- movl %r9d,%edx
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
rorl $11,%r14d
xorl %eax,%r13d
- xorl %ecx,%r15d
+ addl %r15d,%r12d
- xorl %r10d,%edx
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
xorl %r8d,%r14d
- addl %r15d,%r12d
- movl %r9d,%r15d
+ xorl %r9d,%r15d
rorl $6,%r13d
- andl %r8d,%edx
- andl %r10d,%r15d
+ movl %r9d,%edx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%edx
+ xorl %edi,%edx
addl %r12d,%r11d
addl %r12d,%edx
- leaq 1(%rdi),%rdi
- addl %r14d,%edx
+ leaq 4(%rbp),%rbp
movl 56(%rsp),%r13d
- movl 44(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 44(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 24(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
addl 52(%rsp),%r12d
movl %r11d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %edx,%r14d
rorl $14,%r13d
- movl %eax,%r15d
- movl %r12d,52(%rsp)
+ movl %eax,%edi
- rorl $9,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ rorl $9,%r14d
+ xorl %ebx,%edi
- rorl $5,%r13d
- addl %ecx,%r12d
+ movl %r12d,52(%rsp)
xorl %edx,%r14d
+ andl %r11d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r11d,%r15d
- movl %r8d,%ecx
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
rorl $11,%r14d
xorl %r11d,%r13d
- xorl %ebx,%r15d
+ addl %edi,%r12d
- xorl %r9d,%ecx
+ movl %edx,%edi
+ addl (%rbp),%r12d
xorl %edx,%r14d
- addl %r15d,%r12d
- movl %r8d,%r15d
+ xorl %r8d,%edi
rorl $6,%r13d
- andl %edx,%ecx
- andl %r9d,%r15d
+ movl %r8d,%ecx
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ecx
+ xorl %r15d,%ecx
addl %r12d,%r10d
addl %r12d,%ecx
- leaq 1(%rdi),%rdi
- addl %r14d,%ecx
+ leaq 4(%rbp),%rbp
movl 60(%rsp),%r13d
- movl 48(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 48(%rsp),%r15d
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 28(%rsp),%r12d
-
- rorl $2,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
xorl %r14d,%r15d
shrl $10,%r14d
rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
addl 56(%rsp),%r12d
movl %r10d,%r13d
- addl %r14d,%r12d
+ addl %r15d,%r12d
movl %ecx,%r14d
rorl $14,%r13d
movl %r11d,%r15d
- movl %r12d,56(%rsp)
- rorl $9,%r14d
xorl %r10d,%r13d
+ rorl $9,%r14d
xorl %eax,%r15d
- rorl $5,%r13d
- addl %ebx,%r12d
+ movl %r12d,56(%rsp)
xorl %ecx,%r14d
-
- addl (%rbp,%rdi,4),%r12d
andl %r10d,%r15d
- movl %edx,%ebx
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
rorl $11,%r14d
xorl %r10d,%r13d
- xorl %eax,%r15d
+ addl %r15d,%r12d
- xorl %r8d,%ebx
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
xorl %ecx,%r14d
- addl %r15d,%r12d
- movl %edx,%r15d
+ xorl %edx,%r15d
rorl $6,%r13d
- andl %ecx,%ebx
- andl %r8d,%r15d
+ movl %edx,%ebx
+ andl %r15d,%edi
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%ebx
+ xorl %edi,%ebx
addl %r12d,%r9d
addl %r12d,%ebx
- leaq 1(%rdi),%rdi
- addl %r14d,%ebx
+ leaq 4(%rbp),%rbp
movl 0(%rsp),%r13d
- movl 52(%rsp),%r14d
- movl %r13d,%r12d
- movl %r14d,%r15d
+ movl 52(%rsp),%edi
- rorl $11,%r12d
- xorl %r13d,%r12d
- shrl $3,%r13d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
- rorl $7,%r12d
xorl %r12d,%r13d
- movl 32(%rsp),%r12d
-
- rorl $2,%r15d
- xorl %r14d,%r15d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
shrl $10,%r14d
- rorl $17,%r15d
- addl %r13d,%r12d
- xorl %r15d,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
addl 60(%rsp),%r12d
movl %r9d,%r13d
- addl %r14d,%r12d
+ addl %edi,%r12d
movl %ebx,%r14d
rorl $14,%r13d
- movl %r10d,%r15d
- movl %r12d,60(%rsp)
+ movl %r10d,%edi
- rorl $9,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ rorl $9,%r14d
+ xorl %r11d,%edi
- rorl $5,%r13d
- addl %eax,%r12d
+ movl %r12d,60(%rsp)
xorl %ebx,%r14d
+ andl %r9d,%edi
- addl (%rbp,%rdi,4),%r12d
- andl %r9d,%r15d
- movl %ecx,%eax
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
rorl $11,%r14d
xorl %r9d,%r13d
- xorl %r11d,%r15d
+ addl %edi,%r12d
- xorl %edx,%eax
+ movl %ebx,%edi
+ addl (%rbp),%r12d
xorl %ebx,%r14d
- addl %r15d,%r12d
- movl %ecx,%r15d
+ xorl %ecx,%edi
rorl $6,%r13d
- andl %ebx,%eax
- andl %edx,%r15d
+ movl %ecx,%eax
+ andl %edi,%r15d
rorl $2,%r14d
addl %r13d,%r12d
- addl %r15d,%eax
+ xorl %r15d,%eax
addl %r12d,%r8d
addl %r12d,%eax
- leaq 1(%rdi),%rdi
- addl %r14d,%eax
- cmpq $64,%rdi
- jb L$rounds_16_xx
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz L$rounds_16_xx
movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
leaq 64(%rsi),%rsi
addl 0(%rdi),%eax
@@ -1800,19 +1743,1138 @@ L$epilogue:
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+
+.p2align 6
+sha256_block_data_order_ssse3:
+L$ssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+L$prologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp L$loop_ssse3
+.p2align 4
+L$loop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+.byte 102,15,56,0,199
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 32(%rbp),%xmm5
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$ssse3_00_47
+
+.p2align 4
+L$ssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$ssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_ssse3
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_ssse3:
+ .byte 0xf3,0xc3
+