author		Nikos Mavrogiannopoulos <nmav@redhat.com>	2014-05-30 15:40:14 +0200
committer	Nikos Mavrogiannopoulos <nmav@redhat.com>	2014-05-30 15:40:14 +0200
commit		676040da1b760daf8da94944ebf271696b955b59 (patch)
tree		ba26fdcab72161ff01cb1aa65dbaa4e7df6c1940
parent		bd93a14d329f0bdd291f1461d11db0c1dc33f369 (diff)
download	gnutls-676040da1b760daf8da94944ebf271696b955b59.tar.gz
Updated asm sources
34 files changed, 27755 insertions, 11804 deletions
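Most hunks below apply one mechanical rewrite to the AES-NI round loops: the per-iteration `decl %ecx` / `leal 32(%edx),%edx` bookkeeping is dropped in favor of scaling the stored round count to a byte offset (`shll $4,%ecx`), precomputing the end of the key schedule, and walking it with a negative index that doubles as the loop counter. A minimal one-block sketch of the new pattern, distilled from the `__aesni_encrypt2` hunk (label and comments are illustrative, not copied from any routine; `%ecx` holds the count that `__aesni_set_encrypt_key` stores in this file, 9/11/13 for 10/12/14 rounds):

	# sketch: %edx = key schedule, %ecx = stored round count, %xmm2 = block
	movups	(%edx),%xmm0		# round key 0
	shll	$4,%ecx			# count -> byte offset (count*16)
	movups	16(%edx),%xmm1		# round key 1
	xorps	%xmm0,%xmm2		# whitening
	movups	32(%edx),%xmm0		# round key 2
	leal	32(%edx,%ecx,1),%edx	# %edx = end of the walked schedule
	negl	%ecx
	addl	$16,%ecx		# %ecx = -(count-1)*16
.Lenc_loop_sketch:
.byte	102,15,56,220,209		# aesenc %xmm1,%xmm2 (odd round)
	movups	(%edx,%ecx,1),%xmm1	# fetch next odd round key
	addl	$32,%ecx		# advance; ZF set once schedule is exhausted
.byte	102,15,56,220,208		# aesenc %xmm0,%xmm2 (even round)
	movups	-16(%edx,%ecx,1),%xmm0	# fetch next even round key
	jnz	.Lenc_loop_sketch
.byte	102,15,56,220,209		# aesenc %xmm1,%xmm2 (next-to-last round)
.byte	102,15,56,221,208		# aesenclast %xmm0,%xmm2 (final round)
	ret

The new two-block helpers (`__aesni_encrypt2`/`__aesni_decrypt2`) built on this pattern replace the earlier idiom of zeroing %xmm4 and calling the three-block routine, so two-block tails no longer run a wasted third AES pipeline.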
diff --git a/devel/openssl b/devel/openssl -Subproject b0ee17ad475c97f068c7314efafdd8a53d92af5 +Subproject e09ea622bba106e13ab85173c205f354b0f1d48 diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s index e209b7d0c0..063f6b157e 100644 --- a/lib/accelerated/x86/coff/aes-ssse3-x86.s +++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s @@ -84,33 +84,33 @@ __vpaes_encrypt_core: movdqa %xmm6,%xmm1 movdqa (%ebp),%xmm2 pandn %xmm0,%xmm1 - movdqu (%edx),%xmm5 - psrld $4,%xmm1 pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 .byte 102,15,56,0,208 movdqa 16(%ebp),%xmm0 -.byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 + psrld $4,%xmm1 addl $16,%edx +.byte 102,15,56,0,193 leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 jmp .L000enc_entry .align 16 .L001enc_loop: movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 movdqa 80(%ebp),%xmm2 -.byte 102,15,56,0,211 - pxor %xmm5,%xmm2 movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addl $16,%edx pxor %xmm2,%xmm0 @@ -119,28 +119,28 @@ __vpaes_encrypt_core: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andl $48,%ecx - pxor %xmm3,%xmm0 subl $1,%eax + pxor %xmm3,%xmm0 .L000enc_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 - movdqu (%edx),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz .L001enc_loop movdqa 96(%ebp),%xmm4 @@ -155,8 +155,8 @@ __vpaes_encrypt_core: .def __vpaes_decrypt_core; .scl 3; .type 32; .endef .align 16 __vpaes_decrypt_core: - movl 240(%edx),%eax leal 608(%ebp),%ebx + movl 240(%edx),%eax movdqa %xmm6,%xmm1 movdqa -64(%ebx),%xmm2 pandn %xmm0,%xmm1 @@ -179,56 +179,56 @@ __vpaes_decrypt_core: .align 16 .L003dec_loop: movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addl $16,%edx -.byte 102,15,56,0,197 movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subl $1,%eax .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + addl $16,%edx .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax .L002dec_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 pandn %xmm0,%xmm1 - psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm2 + psrld $4,%xmm1 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 
102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm7,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 jnz .L003dec_loop movdqa 96(%ebx),%xmm4 .byte 102,15,56,0,226 @@ -335,12 +335,12 @@ __vpaes_schedule_core: .def __vpaes_schedule_192_smear; .scl 3; .type 32; .endef .align 16 __vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 ret .def __vpaes_schedule_round; .scl 3; .type 32; .endef @@ -601,6 +601,8 @@ _vpaes_cbc_encrypt: movl 24(%esp),%edi movl 28(%esp),%eax movl 32(%esp),%edx + subl $16,%eax + jc .L020cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -610,18 +612,17 @@ _vpaes_cbc_encrypt: subl %esi,%edi movl %ebx,48(%esp) movl %edi,(%esp) - subl $16,%eax movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal .L_vpaes_consts+0x30-.L020pic_point,%ebp + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp call __vpaes_preheat -.L020pic_point: +.L021pic_point: cmpl $0,%ecx - je .L021cbc_dec_loop - jmp .L022cbc_enc_loop + je .L022cbc_dec_loop + jmp .L023cbc_enc_loop .align 16 -.L022cbc_enc_loop: +.L023cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call __vpaes_encrypt_core @@ -631,10 +632,10 @@ _vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L022cbc_enc_loop - jmp .L023cbc_done + jnc .L023cbc_enc_loop + jmp .L024cbc_done .align 16 -.L021cbc_dec_loop: +.L022cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -646,11 +647,12 @@ _vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L021cbc_dec_loop -.L023cbc_done: + jnc .L022cbc_dec_loop +.L024cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) +.L020cbc_abort: popl %edi popl %esi popl %ebx diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s index 1a6cf83232..c052b5ba6c 100644 --- a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s @@ -43,8 +43,8 @@ _vpaes_encrypt_core: movdqa .Lk_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 addq $16,%r9 + pxor %xmm2,%xmm0 leaq .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry @@ -52,19 +52,19 @@ _vpaes_encrypt_core: .Lenc_loop: movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 - pxor %xmm5,%xmm2 - movdqa (%r11,%r10,1),%xmm4 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 @@ -73,30 +73,30 @@ _vpaes_encrypt_core: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $48,%r11 - pxor %xmm3,%xmm0 subq $1,%rax + pxor %xmm3,%xmm0 .Lenc_entry: movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm10,%xmm2 + pxor 
%xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 - movdqu (%r9),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz .Lenc_loop @@ -149,62 +149,61 @@ _vpaes_decrypt_core: movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addq $16,%r9 - -.byte 102,15,56,0,197 movdqa 0(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%r10),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subq $1,%rax + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 -.byte 102,15,56,0,197 - movdqa 32(%r10),%xmm4 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 +.byte 102,15,56,0,226 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - + addq $16,%r9 .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax .Ldec_entry: movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 jnz .Ldec_loop @@ -212,7 +211,7 @@ _vpaes_decrypt_core: .byte 102,15,56,0,226 pxor %xmm0,%xmm4 movdqa 112(%r10),%xmm0 - movdqa .Lk_sr-.Lk_dsbd(%r11),%xmm2 + movdqa -352(%r11),%xmm2 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 .byte 102,15,56,0,194 @@ -232,7 +231,7 @@ _vpaes_schedule_core: - call _vpaes_preheat + call _vpaes_preheat movdqa .Lk_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -278,7 +277,7 @@ _vpaes_schedule_core: call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle jmp .Loop_schedule_128 @@ -299,7 +298,7 @@ _vpaes_schedule_core: .p2align 4 .Lschedule_192: movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqa %xmm0,%xmm6 pxor %xmm4,%xmm4 movhlps %xmm4,%xmm6 @@ -308,13 +307,13 @@ _vpaes_schedule_core: .Loop_schedule_192: call _vpaes_schedule_round .byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear jmp .Loop_schedule_192 @@ -331,18 +330,18 @@ _vpaes_schedule_core: .p2align 4 .Lschedule_256: movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movl $7,%esi .Loop_schedule_256: - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle pshufd $255,%xmm0,%xmm0 @@ -380,7 +379,7 @@ _vpaes_schedule_core: .Lschedule_mangle_last_dec: addq $-16,%rdx pxor .Lk_s63(%rip),%xmm0 - call 
_vpaes_schedule_transform + call _vpaes_schedule_transform movdqu %xmm0,(%rdx) @@ -412,12 +411,12 @@ _vpaes_schedule_core: .def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef .p2align 4 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 @@ -827,6 +826,8 @@ vpaes_cbc_encrypt: movq 48(%rsp),%r9 xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort leaq -184(%rsp),%rsp movaps %xmm6,16(%rsp) movaps %xmm7,32(%rsp) @@ -841,7 +842,6 @@ vpaes_cbc_encrypt: .Lcbc_body: movdqu (%r8),%xmm6 subq %rdi,%rsi - subq $16,%rcx call _vpaes_preheat cmpl $0,%r9d je .Lcbc_dec_loop @@ -882,6 +882,7 @@ vpaes_cbc_encrypt: movaps 160(%rsp),%xmm15 leaq 184(%rsp),%rsp .Lcbc_epilogue: +.Lcbc_abort: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -1006,7 +1007,7 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 @@ -1045,7 +1046,7 @@ se_handler: leaq 16(%rax),%rsi leaq 512(%r8),%rdi movl $20,%ecx -.long 0xa548f3fc +.long 0xa548f3fc leaq 184(%rax),%rax .Lin_prologue: @@ -1058,7 +1059,7 @@ se_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -1115,21 +1116,21 @@ se_handler: .LSEH_info_vpaes_set_encrypt_key: .byte 9,0,0,0 .rva se_handler -.rva .Lenc_key_body,.Lenc_key_epilogue +.rva .Lenc_key_body,.Lenc_key_epilogue .LSEH_info_vpaes_set_decrypt_key: .byte 9,0,0,0 .rva se_handler -.rva .Ldec_key_body,.Ldec_key_epilogue +.rva .Ldec_key_body,.Ldec_key_epilogue .LSEH_info_vpaes_encrypt: .byte 9,0,0,0 .rva se_handler -.rva .Lenc_body,.Lenc_epilogue +.rva .Lenc_body,.Lenc_epilogue .LSEH_info_vpaes_decrypt: .byte 9,0,0,0 .rva se_handler -.rva .Ldec_body,.Ldec_epilogue +.rva .Ldec_body,.Ldec_epilogue .LSEH_info_vpaes_cbc_encrypt: .byte 9,0,0,0 .rva se_handler -.rva .Lcbc_body,.Lcbc_epilogue +.rva .Lcbc_body,.Lcbc_epilogue diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s index 502be77883..1c9b935f34 100644 --- a/lib/accelerated/x86/coff/aesni-x86.s +++ b/lib/accelerated/x86/coff/aesni-x86.s @@ -85,29 +85,82 @@ _aesni_decrypt: .byte 102,15,56,223,209 movups %xmm2,(%eax) ret +.def __aesni_encrypt2; .scl 3; .type 32; .endef +.align 16 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.def __aesni_decrypt2; .scl 3; .type 32; .endef +.align 16 
+__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret .def __aesni_encrypt3; .scl 3; .type 32; .endef .align 16 __aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz .L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -119,25 +172,26 @@ __aesni_encrypt3: .align 16 __aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz .L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -150,27 +204,29 @@ __aesni_decrypt3: __aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz .L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -185,27 +241,29 @@ __aesni_encrypt4: __aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 
102,15,56,222,232 - movups (%edx),%xmm0 - jnz .L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -219,45 +277,44 @@ __aesni_decrypt4: .align 16 __aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 + addl $16,%ecx +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 + movups -16(%edx,%ecx,1),%xmm0 jmp .L_aesni_encrypt6_enter .align 16 -.L006enc6_loop: +.L008enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 16 .L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz .L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L008enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -275,45 +332,44 @@ __aesni_encrypt6: .align 16 __aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 + addl $16,%ecx +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 + movups -16(%edx,%ecx,1),%xmm0 jmp .L_aesni_decrypt6_enter .align 16 -.L007dec6_loop: +.L009dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 16 .L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz .L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L009dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -342,14 +398,14 @@ _aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L008ecb_ret + jz .L010ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L009ecb_decrypt + jz .L011ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L010ecb_enc_tail + jb .L012ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -358,9 +414,9 @@ _aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L011ecb_enc_loop6_enter + jmp .L013ecb_enc_loop6_enter .align 16 -.L012ecb_enc_loop6: +.L014ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -375,12 +431,12 @@ _aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L011ecb_enc_loop6_enter: 
+.L013ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L012ecb_enc_loop6 + jnc .L014ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -389,18 +445,18 @@ _aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L010ecb_enc_tail: + jz .L010ecb_ret +.L012ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L013ecb_enc_one + jb .L015ecb_enc_one movups 16(%esi),%xmm3 - je .L014ecb_enc_two + je .L016ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L015ecb_enc_three + jb .L017ecb_enc_three movups 48(%esi),%xmm5 - je .L016ecb_enc_four + je .L018ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -409,50 +465,49 @@ _aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L013ecb_enc_one: +.L015ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L017enc1_loop_3: +.L019enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L017enc1_loop_3 + jnz .L019enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L014ecb_enc_two: - xorps %xmm4,%xmm4 - call __aesni_encrypt3 +.L016ecb_enc_two: + call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L015ecb_enc_three: +.L017ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L016ecb_enc_four: +.L018ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L009ecb_decrypt: +.L011ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L018ecb_dec_tail + jb .L020ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -461,9 +516,9 @@ _aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L019ecb_dec_loop6_enter + jmp .L021ecb_dec_loop6_enter .align 16 -.L020ecb_dec_loop6: +.L022ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -478,12 +533,12 @@ _aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L019ecb_dec_loop6_enter: +.L021ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L020ecb_dec_loop6 + jnc .L022ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -492,18 +547,18 @@ _aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L018ecb_dec_tail: + jz .L010ecb_ret +.L020ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L021ecb_dec_one + jb .L023ecb_dec_one movups 16(%esi),%xmm3 - je .L022ecb_dec_two + je .L024ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L023ecb_dec_three + jb .L025ecb_dec_three movups 48(%esi),%xmm5 - je .L024ecb_dec_four + je .L026ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -512,44 +567,43 @@ _aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L021ecb_dec_one: +.L023ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L025dec1_loop_4: +.L027dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz 
.L025dec1_loop_4 + jnz .L027dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L022ecb_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +.L024ecb_dec_two: + call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L023ecb_dec_three: +.L025ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L024ecb_dec_four: +.L026ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L008ecb_ret: +.L010ecb_ret: popl %edi popl %esi popl %ebx @@ -587,45 +641,45 @@ _aesni_ccm64_encrypt_blocks: movl %ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -.L026ccm64_enc_outer: +.L028ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -.L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +.L029ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L029ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz .L026ccm64_enc_outer + leal 16(%edi),%edi + jnz .L028ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -675,67 +729,70 @@ _aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L028enc1_loop_5: +.L030enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L028enc1_loop_5 + jnz .L030enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp .L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp .L031ccm64_dec_outer .align 16 -.L029ccm64_dec_outer: +.L031ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L030ccm64_dec_break + jz .L032ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -.L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +.L033ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L033ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp .L029ccm64_dec_outer + leal 16(%esi),%esi + jmp .L031ccm64_dec_outer .align 16 -.L030ccm64_dec_break: 
+.L032ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L032enc1_loop_6: +.L034enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_6 + jnz .L034enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -764,7 +821,7 @@ _aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L033ctr32_one_shortcut + je .L035ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -780,63 +837,59 @@ _aesni_ctr32_encrypt_blocks: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L034ctr32_tail + jb .L036ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L035ctr32_loop6 -.align 16 -.L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx + jmp .L037ctr32_loop6 +.align 16 +.L037ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -847,51 +900,51 @@ _aesni_ctr32_encrypt_blocks: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc 
.L035ctr32_loop6 + jnc .L037ctr32_loop6 addl $6,%eax - jz .L036ctr32_ret + jz .L038ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -.L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L036ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb .L039ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L038ctr32_two - pshufd $192,%xmm0,%xmm5 + je .L040ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb .L041ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L040ctr32_four + je .L042ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -909,39 +962,39 @@ _aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L033ctr32_one_shortcut: +.L035ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L037ctr32_one: +.L039ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L041enc1_loop_7: +.L043enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L041enc1_loop_7 + jnz .L043enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L038ctr32_two: - call __aesni_encrypt3 +.L040ctr32_two: + call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L039ctr32_three: +.L041ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -952,9 +1005,9 @@ _aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L040ctr32_four: +.L042ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -968,7 +1021,7 @@ _aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L036ctr32_ret: +.L038ctr32_ret: movl 80(%esp),%esp popl %edi popl %esi @@ -992,12 +1045,12 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L042enc1_loop_8: +.L044enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L042enc1_loop_8 + jnz .L044enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1021,12 +1074,14 @@ _aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L044xts_enc_loop6 + jc .L045xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L046xts_enc_loop6 .align 16 -.L044xts_enc_loop6: +.L046xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1062,6 +1117,7 @@ _aesni_xts_encrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1077,19 +1133,17 @@ _aesni_xts_encrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 
102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1114,26 +1168,25 @@ _aesni_xts_encrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc .L046xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L043xts_enc_short: +.L045xts_enc_short: addl $96,%eax - jz .L045xts_enc_done6x + jz .L047xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L046xts_enc_one + jb .L048xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L047xts_enc_two + je .L049xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1142,7 +1195,7 @@ _aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L048xts_enc_three + jb .L050xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1152,7 +1205,7 @@ _aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L049xts_enc_four + je .L051xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1184,9 +1237,9 @@ _aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L046xts_enc_one: +.L048xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1194,37 +1247,36 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L051enc1_loop_9: +.L053enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L051enc1_loop_9 + jnz .L053enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L047xts_enc_two: +.L049xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call __aesni_encrypt3 + call __aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L048xts_enc_three: +.L050xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1242,9 +1294,9 @@ _aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L049xts_enc_four: +.L051xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1266,28 +1318,28 @@ _aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L045xts_enc_done6x: +.L047xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L052xts_enc_ret + jz .L054xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L053xts_enc_steal + jmp .L055xts_enc_steal .align 16 -.L050xts_enc_done: +.L052xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L052xts_enc_ret + jz .L054xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L053xts_enc_steal: +.L055xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1295,7 +1347,7 @@ _aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L053xts_enc_steal + jnz .L055xts_enc_steal subl 112(%esp),%edi 
movl %ebp,%edx movl %ebx,%ecx @@ -1305,16 +1357,16 @@ _aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L054enc1_loop_10: +.L056enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L054enc1_loop_10 + jnz .L056enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L052xts_enc_ret: +.L054xts_enc_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1338,12 +1390,12 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_11: +.L057enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_11 + jnz .L057enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1372,12 +1424,14 @@ _aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L057xts_dec_loop6 + jc .L058xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L059xts_dec_loop6 .align 16 -.L057xts_dec_loop6: +.L059xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1413,6 +1467,7 @@ _aesni_xts_decrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1428,19 +1483,17 @@ _aesni_xts_decrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call .L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1465,26 +1518,25 @@ _aesni_xts_decrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc .L059xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L056xts_dec_short: +.L058xts_dec_short: addl $96,%eax - jz .L058xts_dec_done6x + jz .L060xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L059xts_dec_one + jb .L061xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L060xts_dec_two + je .L062xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1493,7 +1545,7 @@ _aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L061xts_dec_three + jb .L063xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1503,7 +1555,7 @@ _aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L062xts_dec_four + je .L064xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1535,9 +1587,9 @@ _aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L059xts_dec_one: +.L061xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1545,36 +1597,36 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L064dec1_loop_12: +.L066dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L064dec1_loop_12 + jnz .L066dec1_loop_12 .byte 
102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L060xts_dec_two: +.L062xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call __aesni_decrypt3 + call __aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L061xts_dec_three: +.L063xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1592,9 +1644,9 @@ _aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L062xts_dec_four: +.L064xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1616,20 +1668,20 @@ _aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L058xts_dec_done6x: +.L060xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L065xts_dec_ret + jz .L067xts_dec_ret movl %eax,112(%esp) - jmp .L066xts_dec_only_one_more + jmp .L068xts_dec_only_one_more .align 16 -.L063xts_dec_done: +.L065xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L065xts_dec_ret + jz .L067xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1639,7 +1691,7 @@ _aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L066xts_dec_only_one_more: +.L068xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1653,16 +1705,16 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L067dec1_loop_13: +.L069dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L067dec1_loop_13 + jnz .L069dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L068xts_dec_steal: +.L070xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1670,7 +1722,7 @@ _aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L068xts_dec_steal + jnz .L070xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1680,16 +1732,16 @@ _aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_14: +.L071dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_14 + jnz .L071dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L065xts_dec_ret: +.L067xts_dec_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1714,7 +1766,7 @@ _aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L070cbc_abort + jz .L072cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1722,14 +1774,14 @@ _aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L071cbc_decrypt + je .L073cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L072cbc_enc_tail + jb .L074cbc_enc_tail subl $16,%eax - jmp .L073cbc_enc_loop + jmp .L075cbc_enc_loop .align 16 -.L073cbc_enc_loop: +.L075cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1737,24 +1789,24 @@ _aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L074enc1_loop_15: +.L076enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L074enc1_loop_15 + jnz .L076enc1_loop_15 .byte 
102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L073cbc_enc_loop + jnc .L075cbc_enc_loop addl $16,%eax - jnz .L072cbc_enc_tail + jnz .L074cbc_enc_tail movaps %xmm2,%xmm7 - jmp .L075cbc_ret -.L072cbc_enc_tail: + jmp .L077cbc_ret +.L074cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1765,20 +1817,20 @@ _aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L073cbc_enc_loop + jmp .L075cbc_enc_loop .align 16 -.L071cbc_decrypt: +.L073cbc_decrypt: cmpl $80,%eax - jbe .L076cbc_dec_tail + jbe .L078cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L077cbc_dec_loop6_enter + jmp .L079cbc_dec_loop6_enter .align 16 -.L078cbc_dec_loop6: +.L080cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L077cbc_dec_loop6_enter: +.L079cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1808,28 +1860,28 @@ _aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L078cbc_dec_loop6 + ja .L080cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L079cbc_dec_tail_collected + jle .L081cbc_dec_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L076cbc_dec_tail: +.L078cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L080cbc_dec_one + jbe .L082cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L081cbc_dec_two + jbe .L083cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L082cbc_dec_three + jbe .L084cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L083cbc_dec_four + jbe .L085cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1852,28 +1904,27 @@ _aesni_cbc_encrypt: leal 64(%edi),%edi movaps %xmm6,%xmm2 subl $80,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L080cbc_dec_one: +.L082cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L084dec1_loop_16: +.L086dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L084dec1_loop_16 + jnz .L086dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L081cbc_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +.L083cbc_dec_two: + call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) @@ -1881,9 +1932,9 @@ _aesni_cbc_encrypt: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L082cbc_dec_three: +.L084cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1894,9 +1945,9 @@ _aesni_cbc_encrypt: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L083cbc_dec_four: +.L085cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1911,23 +1962,23 @@ _aesni_cbc_encrypt: leal 48(%edi),%edi movaps %xmm5,%xmm2 subl $64,%eax -.L079cbc_dec_tail_collected: +.L081cbc_dec_tail_collected: andl $15,%eax - jnz .L085cbc_dec_tail_partial + jnz .L087cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L075cbc_ret + jmp .L077cbc_ret .align 16 -.L085cbc_dec_tail_partial: +.L087cbc_dec_tail_partial: movaps %xmm2,(%esp) movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L075cbc_ret: +.L077cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp movups %xmm7,(%ebp) 
-.L070cbc_abort: +.L072cbc_abort: popl %edi popl %esi popl %ebx @@ -1937,51 +1988,51 @@ _aesni_cbc_encrypt: .align 16 __aesni_set_encrypt_key: testl %eax,%eax - jz .L086bad_pointer + jz .L088bad_pointer testl %edx,%edx - jz .L086bad_pointer + jz .L088bad_pointer movups (%eax),%xmm0 xorps %xmm4,%xmm4 leal 16(%edx),%edx cmpl $256,%ecx - je .L08714rounds + je .L08914rounds cmpl $192,%ecx - je .L08812rounds + je .L09012rounds cmpl $128,%ecx - jne .L089bad_keybits + jne .L091bad_keybits .align 16 -.L09010rounds: +.L09210rounds: movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L091key_128_cold + call .L093key_128_cold .byte 102,15,58,223,200,2 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,4 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,8 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,16 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,32 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,64 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,128 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,27 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,54 - call .L092key_128 + call .L094key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) xorl %eax,%eax ret .align 16 -.L092key_128: +.L094key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L091key_128_cold: +.L093key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1990,38 +2041,38 @@ __aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L08812rounds: +.L09012rounds: movq 16(%eax),%xmm2 movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L093key_192a_cold + call .L095key_192a_cold .byte 102,15,58,223,202,2 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,4 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,8 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,16 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,32 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,64 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,128 - call .L094key_192b + call .L096key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) xorl %eax,%eax ret .align 16 -.L095key_192a: +.L097key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L093key_192a_cold: +.L095key_192a_cold: movaps %xmm2,%xmm5 -.L096key_192b_warm: +.L098key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2035,56 +2086,56 @@ __aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L094key_192b: +.L096key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L096key_192b_warm + jmp .L098key_192b_warm .align 16 -.L08714rounds: +.L08914rounds: movups 16(%eax),%xmm2 movl $13,%ecx leal 16(%edx),%edx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L097key_256a_cold + call .L099key_256a_cold .byte 102,15,58,223,200,1 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,2 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,2 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,4 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,4 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,8 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,8 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,16 - call 
.L099key_256a + call .L101key_256a .byte 102,15,58,223,200,16 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,32 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,32 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,64 - call .L099key_256a + call .L101key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax ret .align 16 -.L099key_256a: +.L101key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L097key_256a_cold: +.L099key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2093,7 +2144,7 @@ __aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L098key_256b: +.L100key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2104,11 +2155,11 @@ __aesni_set_encrypt_key: xorps %xmm1,%xmm2 ret .align 4 -.L086bad_pointer: +.L088bad_pointer: movl $-1,%eax ret .align 4 -.L089bad_keybits: +.L091bad_keybits: movl $-2,%eax ret .globl _aesni_set_encrypt_key @@ -2133,7 +2184,7 @@ _aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L100dec_key_ret + jnz .L102dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2141,7 +2192,7 @@ _aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L101dec_key_inverse: +.L103dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2151,12 +2202,12 @@ _aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L101dec_key_inverse + ja .L103dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) xorl %eax,%eax -.L100dec_key_ret: +.L102dec_key_ret: ret .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s index 8751ccba8f..a79c18b158 100644 --- a/lib/accelerated/x86/coff/aesni-x86_64.s +++ b/lib/accelerated/x86/coff/aesni-x86_64.s @@ -38,6 +38,7 @@ # *** This file is auto-generated *** # .text + .globl aesni_encrypt .def aesni_encrypt; .scl 2; .type 32; .endef .p2align 4 @@ -53,7 +54,7 @@ aesni_encrypt: decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_1 + jnz .Loop_enc1_1 .byte 102,15,56,221,209 movups %xmm2,(%rdx) .byte 0xf3,0xc3 @@ -74,34 +75,93 @@ aesni_decrypt: decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_dec1_2 + jnz .Loop_dec1_2 .byte 102,15,56,223,209 movups %xmm2,(%rdx) .byte 0xf3,0xc3 +.def _aesni_encrypt2; .scl 3; .type 32; .endef +.p2align 4 +_aesni_encrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + .byte 0xf3,0xc3 + +.def _aesni_decrypt2; .scl 3; .type 32; .endef +.p2align 4 +_aesni_decrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop2: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz 
.Ldec_loop2 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + .byte 0xf3,0xc3 + .def _aesni_encrypt3; .scl 3; .type 32; .endef .p2align 4 _aesni_encrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Lenc_loop3: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop3 .byte 102,15,56,220,209 @@ -116,25 +176,26 @@ _aesni_encrypt3: .p2align 4 _aesni_decrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Ldec_loop3: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop3 .byte 102,15,56,222,209 @@ -149,28 +210,30 @@ _aesni_decrypt3: .p2align 4 _aesni_encrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Lenc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop4 .byte 102,15,56,220,209 @@ -187,28 +250,30 @@ _aesni_encrypt4: .p2align 4 _aesni_decrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Ldec_loop4: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop4 .byte 102,15,56,222,209 @@ -225,43 +290,43 @@ _aesni_decrypt4: .p2align 4 _aesni_encrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%rcx),%xmm0 .byte 
102,15,56,220,249 + movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop6_enter .p2align 4 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .Lenc_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop6 .byte 102,15,56,220,209 @@ -282,43 +347,43 @@ _aesni_encrypt6: .p2align 4 _aesni_decrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%rcx),%xmm0 .byte 102,15,56,222,249 + movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop6_enter .p2align 4 .Ldec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .Ldec_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop6 .byte 102,15,56,222,209 @@ -339,52 +404,51 @@ _aesni_decrypt6: .p2align 4 _aesni_encrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,220,241 +.byte 102,15,56,220,217 pxor %xmm0,%xmm8 -.byte 102,15,56,220,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop8_enter .p2align 4 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 .Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop8 .byte 102,15,56,220,209 @@ -409,52 +473,51 @@ _aesni_encrypt8: .p2align 4 _aesni_decrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 
102,15,56,222,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,222,241 +.byte 102,15,56,222,217 pxor %xmm0,%xmm8 -.byte 102,15,56,222,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop8_enter .p2align 4 .Ldec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 .Ldec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 .byte 102,68,15,56,222,192 .byte 102,68,15,56,222,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop8 .byte 102,15,56,222,209 @@ -593,14 +656,13 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_3 + jnz .Loop_enc1_3 .byte 102,15,56,221,209 movups %xmm2,(%rsi) jmp .Lecb_ret .p2align 4 .Lecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp .Lecb_ret @@ -738,14 +800,13 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_4 + jnz .Loop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) jmp .Lecb_ret .p2align 4 .Lecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 + call _aesni_decrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp .Lecb_ret @@ -811,53 +872,53 @@ aesni_ccm64_encrypt_blocks: movaps %xmm9,48(%rsp) .Lccm64_enc_body: movl 240(%rcx),%eax - movdqu (%r8),%xmm9 - movdqa .Lincrement64(%rip),%xmm6 + movdqu (%r8),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - shrl $1,%eax + shll $4,%eax + movl $16,%r10d leaq 0(%rcx),%r11 movdqu (%r9),%xmm3 - movdqa %xmm9,%xmm2 - movl %eax,%r10d -.byte 102,68,15,56,0,207 + movdqa %xmm6,%xmm2 + leaq 32(%rcx,%rax,1),%rcx +.byte 102,15,56,0,247 + subq %rax,%r10 jmp .Lccm64_enc_outer .p2align 4 .Lccm64_enc_outer: movups (%r11),%xmm0 - movl %r10d,%eax + movq %r10,%rax movups (%rdi),%xmm8 xorps %xmm0,%xmm2 movups 16(%r11),%xmm1 xorps %xmm8,%xmm0 - leaq 32(%r11),%rcx xorps %xmm0,%xmm3 - movups (%rcx),%xmm0 + movups 32(%r11),%xmm0 .Lccm64_enc2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 + decq %rdx .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decq %rdx leaq 16(%rdi),%rdi xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi .byte 102,15,56,0,215 + leaq 16(%rsi),%rsi jnz .Lccm64_enc_outer movups %xmm3,(%r9) @@ -893,15 +954,15 @@ aesni_ccm64_decrypt_blocks: movaps %xmm9,48(%rsp) .Lccm64_dec_body: movl 240(%rcx),%eax - movups (%r8),%xmm9 + movups (%r8),%xmm6 movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - movaps %xmm9,%xmm2 + movaps %xmm6,%xmm2 movl %eax,%r10d movq 
%rcx,%r11 -.byte 102,68,15,56,0,207 +.byte 102,15,56,0,247 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -911,17 +972,21 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 + shll $4,%r10d + movl $16,%eax movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 leaq 16(%rdi),%rdi + subq %r10,%rax + leaq 32(%r11,%r10,1),%rcx + movq %rax,%r10 jmp .Lccm64_dec_outer .p2align 4 .Lccm64_dec_outer: xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 - movl %r10d,%eax + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) leaq 16(%rsi),%rsi .byte 102,15,56,0,215 @@ -930,36 +995,36 @@ aesni_ccm64_decrypt_blocks: jz .Lccm64_dec_break movups (%r11),%xmm0 - shrl $1,%eax + movq %r10,%rax movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 - leaq 32(%r11),%rcx xorps %xmm0,%xmm2 xorps %xmm8,%xmm3 - movups (%rcx),%xmm0 - + movups 32(%r11),%xmm0 + jmp .Lccm64_dec2_loop +.p2align 4 .Lccm64_dec2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_dec2_loop movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leaq 16(%rdi),%rdi .byte 102,15,56,221,208 .byte 102,15,56,221,216 + leaq 16(%rdi),%rdi jmp .Lccm64_dec_outer .p2align 4 .Lccm64_dec_break: + movl 240(%r11),%eax movups (%r11),%xmm0 movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 @@ -970,7 +1035,7 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%r11),%xmm1 leaq 16(%r11),%r11 - jnz .Loop_enc1_6 + jnz .Loop_enc1_6 .byte 102,15,56,221,217 movups %xmm3,(%r9) movaps (%rsp),%xmm6 @@ -997,211 +1062,529 @@ aesni_ctr32_encrypt_blocks: movq %r9,%rcx movq 40(%rsp),%r8 - leaq -200(%rsp),%rsp - movaps %xmm6,32(%rsp) - movaps %xmm7,48(%rsp) - movaps %xmm8,64(%rsp) - movaps %xmm9,80(%rsp) - movaps %xmm10,96(%rsp) - movaps %xmm11,112(%rsp) - movaps %xmm12,128(%rsp) - movaps %xmm13,144(%rsp) - movaps %xmm14,160(%rsp) - movaps %xmm15,176(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $288,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lctr32_body: + leaq -8(%rax),%rbp + cmpq $1,%rdx je .Lctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa .Lbswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 - + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %r11d,%eax + xorl %r11d,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 movl 240(%rcx),%eax + xorl %r11d,%r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 
102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,0(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,16(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + movl _gnutls_x86_cpuid_s+4(%rip),%r10d + xorl %r11d,%r9d + andl $71303168,%r10d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx jb .Lctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d + subq $6,%rdx + cmpl $4194304,%r10d + je .Lctr32_6x + + leaq 128(%rcx),%rcx + subq $2,%rdx + jmp .Lctr32_loop8 + +.p2align 4 +.Lctr32_6x: + shll $4,%eax + movl $48,%r10d + bswapl %r11d + leaq 32(%rcx,%rax,1),%rcx + subq %rax,%r10 jmp .Lctr32_loop6 .p2align 4 .Lctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + addl $6,%r8d + movups -48(%rcx,%r10,1),%xmm0 +.byte 102,15,56,220,209 + movl %r8d,%eax + xorl %r11d,%eax +.byte 102,15,56,220,217 +.byte 0x0f,0x38,0xf1,0x44,0x24,12 + leal 1(%r8),%eax +.byte 102,15,56,220,225 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,28 +.byte 102,15,56,220,233 + leal 2(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,241 +.byte 0x0f,0x38,0xf1,0x44,0x24,44 + leal 3(%r8),%eax +.byte 102,15,56,220,249 + movups -32(%rcx,%r10,1),%xmm1 + xorl %r11d,%eax + +.byte 102,15,56,220,208 +.byte 0x0f,0x38,0xf1,0x44,0x24,60 + leal 4(%r8),%eax +.byte 102,15,56,220,216 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,76 +.byte 102,15,56,220,224 + leal 5(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,232 +.byte 0x0f,0x38,0xf1,0x44,0x24,92 + movq %r10,%rax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%r10,1),%xmm0 + call .Lenc_loop6 + movdqu (%rdi),%xmm8 + movdqu 16(%rdi),%xmm9 + movdqu 32(%rdi),%xmm10 + movdqu 48(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 80(%rdi),%xmm13 + leaq 96(%rdi),%rdi + movups -64(%rcx,%r10,1),%xmm1 + pxor %xmm2,%xmm8 + movaps 0(%rsp),%xmm2 + pxor %xmm3,%xmm9 + movaps 16(%rsp),%xmm3 + pxor %xmm4,%xmm10 + movaps 32(%rsp),%xmm4 + pxor %xmm5,%xmm11 + movaps 48(%rsp),%xmm5 + pxor %xmm6,%xmm12 + movaps 64(%rsp),%xmm6 + pxor %xmm7,%xmm13 + movaps 80(%rsp),%xmm7 + movdqu %xmm8,(%rsi) + movdqu %xmm9,16(%rsi) + movdqu %xmm10,32(%rsi) + movdqu %xmm11,48(%rsi) + movdqu %xmm12,64(%rsi) + movdqu %xmm13,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Lctr32_loop6 - pxor %xmm0,%xmm3 + addq $6,%rdx + jz .Lctr32_done + + leal -48(%r10),%eax + leaq -80(%rcx,%r10,1),%rcx + negl %eax + shrl $4,%eax + jmp .Lctr32_tail + +.p2align 5 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa .Lincrement32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa 0(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d + nop .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 
102,15,56,220,241 .byte 102,15,56,220,249 - jmp .Lctr32_enc_loop6_enter -.p2align 4 -.Lctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz .Lctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %r11d,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd 16(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,0(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,16(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je 
.Lctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc .Lctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 - addq $6,%rdx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.p2align 4 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx jz .Lctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx .Lctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb .Lctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je .Lctr32_two + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb .Lctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je .Lctr32_four + call .Lenc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu 
%xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.p2align 5 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.p2align 5 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp .Lctr32_done .p2align 4 .Lctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -.Lctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1211,58 +1594,27 @@ aesni_ctr32_encrypt_blocks: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Loop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp .Lctr32_done - -.p2align 4 -.Lctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp .Lctr32_done - -.p2align 4 -.Lctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp .Lctr32_done .p2align 4 -.Lctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - .Lctr32_done: - movaps 32(%rsp),%xmm6 - movaps 48(%rsp),%xmm7 - movaps 64(%rsp),%xmm8 - movaps 80(%rsp),%xmm9 - movaps 96(%rsp),%xmm10 - movaps 112(%rsp),%xmm11 - movaps 128(%rsp),%xmm12 - movaps 144(%rsp),%xmm13 - movaps 160(%rsp),%xmm14 - movaps 176(%rsp),%xmm15 - leaq 200(%rsp),%rsp -.Lctr32_ret: + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -1282,254 +1634,297 @@ aesni_xts_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq -264(%rsp),%rsp - movaps %xmm6,96(%rsp) - movaps %xmm7,112(%rsp) - movaps %xmm8,128(%rsp) - movaps 
%xmm9,144(%rsp) - movaps %xmm10,160(%rsp) - movaps %xmm11,176(%rsp) - movaps %xmm12,192(%rsp) - movaps %xmm13,208(%rsp) - movaps %xmm14,224(%rsp) - movaps %xmm15,240(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $272,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lxts_enc_body: - movups (%r9),%xmm15 + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_8: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_8 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_8 +.byte 102,15,56,221,209 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_enc_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.p2align 4 +.p2align 5 .Lxts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa 
%xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_enc_loop6_enter - -.p2align 4 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.p2align 5 .Lxts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lxts_enc_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 .byte 102,15,56,220,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - 
pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,221,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_enc_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_enc_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 addq $96,%rdx jz .Lxts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb .Lxts_enc_one + pxor %xmm0,%xmm12 je .Lxts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb .Lxts_enc_three + pxor %xmm0,%xmm14 je .Lxts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1570,7 +1965,7 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 + jnz .Loop_enc1_9 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1586,7 +1981,7 @@ aesni_xts_encrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1632,15 +2027,15 @@ aesni_xts_encrypt: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_enc_done @@ -1675,23 +2070,24 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 + jnz .Loop_enc1_10 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movups %xmm2,-16(%rsi) .Lxts_enc_ret: - movaps 96(%rsp),%xmm6 - movaps 112(%rsp),%xmm7 - movaps 128(%rsp),%xmm8 - movaps 144(%rsp),%xmm9 - movaps 160(%rsp),%xmm10 - movaps 176(%rsp),%xmm11 - movaps 192(%rsp),%xmm12 - movaps 208(%rsp),%xmm13 - movaps 224(%rsp),%xmm14 - movaps 240(%rsp),%xmm15 - leaq 264(%rsp),%rsp + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_enc_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -1712,260 +2108,303 @@ 
aesni_xts_decrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq -264(%rsp),%rsp - movaps %xmm6,96(%rsp) - movaps %xmm7,112(%rsp) - movaps %xmm8,128(%rsp) - movaps %xmm9,144(%rsp) - movaps %xmm10,160(%rsp) - movaps %xmm11,176(%rsp) - movaps %xmm12,192(%rsp) - movaps %xmm13,208(%rsp) - movaps %xmm14,224(%rsp) - movaps %xmm15,240(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $272,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lxts_dec_body: - movups (%r9),%xmm15 + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_11: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_11 +.byte 102,15,56,221,209 xorl %eax,%eax testq $15,%rdx setnz %al shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_dec_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.p2align 4 +.p2align 5 .Lxts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - 
+.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_dec_loop6_enter - -.p2align 4 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.p2align 5 .Lxts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Lxts_dec_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 .byte 102,15,56,222,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 
102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,223,84,36,0 + psrad $31,%xmm9 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_dec_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_dec_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz .Lxts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb .Lxts_dec_one + pxor %xmm0,%xmm13 je .Lxts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb .Lxts_dec_three je .Lxts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -2015,7 +2454,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 + jnz .Loop_dec1_12 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -2032,7 +2471,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -2058,7 +2497,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -2068,14 +2507,8 @@ aesni_xts_decrypt: .p2align 4 .Lxts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -2086,16 +2519,16 @@ aesni_xts_decrypt: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_dec_done @@ -2119,7 +2552,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 + jnz .Loop_dec1_13 .byte 
102,15,56,223,209 xorps %xmm11,%xmm2 movups %xmm2,(%rsi) @@ -2149,23 +2582,24 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 + jnz .Loop_dec1_14 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) .Lxts_dec_ret: - movaps 96(%rsp),%xmm6 - movaps 112(%rsp),%xmm7 - movaps 128(%rsp),%xmm8 - movaps 144(%rsp),%xmm9 - movaps 160(%rsp),%xmm10 - movaps 176(%rsp),%xmm11 - movaps 192(%rsp),%xmm12 - movaps 208(%rsp),%xmm13 - movaps 224(%rsp),%xmm14 - movaps 240(%rsp),%xmm15 - leaq 264(%rsp),%rsp + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_dec_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -2215,7 +2649,7 @@ aesni_cbc_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 + jnz .Loop_enc1_15 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -2231,169 +2665,408 @@ aesni_cbc_encrypt: .Lcbc_enc_tail: movq %rdx,%rcx xchgq %rdi,%rsi -.long 0x9066A4F3 +.long 0x9066A4F3 movl $16,%ecx subq %rdx,%rcx xorl %eax,%eax -.long 0x9066AAF3 +.long 0x9066AAF3 leaq -16(%rdi),%rdi movl %r10d,%eax movq %rdi,%rsi movq %r11,%rcx xorq %rdx,%rdx - jmp .Lcbc_enc_loop + jmp .Lcbc_enc_loop .p2align 4 .Lcbc_decrypt: - leaq -88(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,16(%rsp) - movaps %xmm8,32(%rsp) - movaps %xmm9,48(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $176,%rsp + andq $-16,%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) .Lcbc_decrypt_body: - movups (%r8),%xmm9 + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe .Lcbc_dec_tail - shrl $1,%r10d - subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,64(%rsp) + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl _gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $80,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $32,%rdx + leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .p2align 4 .Lcbc_dec_loop8: - movaps %xmm0,64(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 
102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 .byte 102,68,15,56,222,193 + setnc %r11b + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call .Ldec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 + jmp .Lcbc_dec_done +.p2align 4 +.Lcbc_dec_done: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 +.byte 102,65,15,56,223,228 + leaq 128(%rdi),%rdi + movdqu 0(%r11),%xmm11 +.byte 
102,65,15,56,223,237 +.byte 102,65,15,56,223,246 + movdqu 16(%r11),%xmm12 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 +.byte 102,68,15,56,223,193 + movdqu 48(%r11),%xmm14 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps 64(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle .Lcbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp .Lcbc_dec_tail_collected + +.p2align 4 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp .Lcbc_dec_tail_collected + +.p2align 4 +.Lcbc_dec_loop6: + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 +.Lcbc_dec_loop6_enter: + leaq 96(%rdi),%rdi + movdqa %xmm7,%xmm8 + + call _aesni_decrypt6 + + pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movq %r11,%rcx + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movl %r10d,%eax + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + subq $96,%rdx + ja .Lcbc_dec_loop6 + + movdqa %xmm7,%xmm2 + addq $80,%rdx + jle .Lcbc_dec_tail_collected + movups %xmm7,(%rsi) leaq 16(%rsi),%rsi + .Lcbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq 
$16,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe .Lcbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe .Lcbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,64(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps 64(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp .Lcbc_dec_tail_collected + .p2align 4 .Lcbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2403,118 +3076,80 @@ aesni_cbc_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp .Lcbc_dec_tail_collected -.p2align 4 -.Lcbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp .Lcbc_dec_tail_collected -.p2align 4 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps 
%xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp .Lcbc_dec_tail_collected + .p2align 4 .Lcbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) jmp .Lcbc_dec_ret .p2align 4 .Lcbc_dec_tail_partial: - movaps %xmm2,64(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq 64(%rsp),%rsi -.long 0x9066A4F3 + leaq (%rsp),%rsi +.long 0x9066A4F3 .Lcbc_dec_ret: - movaps (%rsp),%xmm6 - movaps 16(%rsp),%xmm7 - movaps 32(%rsp),%xmm8 - movaps 48(%rsp),%xmm9 - leaq 88(%rsp),%rsp + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lcbc_ret: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -2524,7 +3159,7 @@ aesni_cbc_encrypt: .def aesni_set_decrypt_key; .scl 2; .type 32; .endef .p2align 4 aesni_set_decrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 call __aesni_set_encrypt_key shll $4,%edx testl %eax,%eax @@ -2563,7 +3198,7 @@ aesni_set_decrypt_key: .p2align 4 aesni_set_encrypt_key: __aesni_set_encrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 movq $-1,%rax testq %rcx,%rcx jz .Lenc_key_ret @@ -2759,6 +3394,8 @@ __aesni_set_encrypt_key: .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2817,51 +3454,15 @@ ccm64_se_handler: leaq 0(%rax),%rsi leaq 512(%r8),%rdi movl $8,%ecx -.long 0xa548f3fc +.long 0xa548f3fc leaq 88(%rax),%rax jmp .Lcommon_seh_tail -.def ctr32_se_handler; .scl 3; .type 32; .endef +.def ctr_xts_se_handler; .scl 3; .type 32; .endef .p2align 4 -ctr32_se_handler: - pushq %rsi - pushq %rdi - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushfq - subq $64,%rsp - - movq 120(%r8),%rax - movq 248(%r8),%rbx - - leaq .Lctr32_body(%rip),%r10 - cmpq %r10,%rbx - jb .Lcommon_seh_tail - - movq 152(%r8),%rax - - leaq .Lctr32_ret(%rip),%r10 - cmpq %r10,%rbx - jae .Lcommon_seh_tail - - leaq 32(%rax),%rsi - leaq 512(%r8),%rdi - movl $20,%ecx -.long 0xa548f3fc - leaq 200(%rax),%rax - - jmp .Lcommon_seh_tail - - -.def xts_se_handler; .scl 3; .type 32; .endef -.p2align 4 -xts_se_handler: +ctr_xts_se_handler: pushq %rsi pushq %rdi pushq %rbx @@ -2891,13 +3492,13 @@ xts_se_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 96(%rax),%rsi + movq 160(%r8),%rax + leaq -160(%rax),%rsi leaq 512(%r8),%rdi movl $20,%ecx -.long 0xa548f3fc - leaq 104+160(%rax),%rax +.long 0xa548f3fc - jmp .Lcommon_seh_tail + jmp .Lcommon_rbp_tail .def cbc_se_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2928,11 +3529,16 @@ cbc_se_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 0(%rax),%rsi + leaq 16(%rax),%rsi leaq 512(%r8),%rdi - movl $8,%ecx -.long 0xa548f3fc - leaq 88(%rax),%rax + movl $20,%ecx +.long 0xa548f3fc + +.Lcommon_rbp_tail: + movq 160(%r8),%rax + movq (%rax),%rbp + leaq 8(%rax),%rax + movq %rbp,160(%r8) jmp .Lcommon_seh_tail .Lrestore_cbc_rax: @@ -2948,7 +3554,7 @@ cbc_se_handler: movq 40(%r9),%rdi movq 
%r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -3022,26 +3628,27 @@ cbc_se_handler: .LSEH_info_ccm64_enc: .byte 9,0,0,0 .rva ccm64_se_handler -.rva .Lccm64_enc_body,.Lccm64_enc_ret +.rva .Lccm64_enc_body,.Lccm64_enc_ret .LSEH_info_ccm64_dec: .byte 9,0,0,0 .rva ccm64_se_handler -.rva .Lccm64_dec_body,.Lccm64_dec_ret +.rva .Lccm64_dec_body,.Lccm64_dec_ret .LSEH_info_ctr32: .byte 9,0,0,0 -.rva ctr32_se_handler +.rva ctr_xts_se_handler +.rva .Lctr32_body,.Lctr32_epilogue .LSEH_info_xts_enc: .byte 9,0,0,0 -.rva xts_se_handler -.rva .Lxts_enc_body,.Lxts_enc_epilogue +.rva ctr_xts_se_handler +.rva .Lxts_enc_body,.Lxts_enc_epilogue .LSEH_info_xts_dec: .byte 9,0,0,0 -.rva xts_se_handler -.rva .Lxts_dec_body,.Lxts_dec_epilogue +.rva ctr_xts_se_handler +.rva .Lxts_dec_body,.Lxts_dec_epilogue .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler .LSEH_info_key: .byte 0x01,0x04,0x01,0x00 -.byte 0x04,0x02,0x00,0x00 +.byte 0x04,0x02,0x00,0x00 diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/coff/e_padlock-x86.s index 47dd6d36d4..41f87b1174 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86.s +++ b/lib/accelerated/x86/coff/e_padlock-x86.s @@ -180,16 +180,14 @@ _padlock_ecb_encrypt: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe .L006ecb_short testl $32,(%edx) - jnz .L007ecb_aligned + jnz .L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L007ecb_aligned + jnz .L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -201,10 +199,28 @@ _padlock_ecb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L008ecb_unaligned_tail + jmp .L007ecb_loop .align 16 -.L008ecb_loop: +.L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -229,8 +245,8 @@ _padlock_ecb_encrypt: testl $15,%edi jz .L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L010ecb_out_aligned: @@ -240,43 +256,75 @@ _padlock_ecb_encrypt: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L008ecb_loop + jz .L011ecb_break + cmpl %ebx,%ecx + jae .L007ecb_loop +.L008ecb_unaligned_tail: + xorl %eax,%eax cmpl %ebp,%esp - je .L011ecb_done + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.align 16 +.L011ecb_break: + cmpl %ebp,%esp + je .L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L012ecb_bzero: +.L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L012ecb_bzero -.L011ecb_done: + ja .L013ecb_bzero +.L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L013ecb_exit + jmp .L014ecb_exit .align 16 -.L006ecb_short: +.L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L008ecb_loop -.align 16 -.L007ecb_aligned: + cmpl $128,%ebp + movl 
$127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -.L013ecb_exit: + testl %ebp,%ebp + jz .L014ecb_exit +.L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.L014ecb_exit: movl $1,%eax leal 4(%esp),%esp .L004ecb_abort: @@ -299,19 +347,17 @@ _padlock_cbc_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L015cbc_abort + jnz .L016cbc_abort testl $15,%ecx - jnz .L015cbc_abort + jnz .L016cbc_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L016cbc_pic_point: +.L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe .L017cbc_short testl $32,(%edx) jnz .L018cbc_aligned testl $15,%edi @@ -331,7 +377,25 @@ _padlock_cbc_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L020cbc_unaligned_tail jmp .L019cbc_loop .align 16 .L019cbc_loop: @@ -343,13 +407,13 @@ _padlock_cbc_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L020cbc_inp_aligned + jz .L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L020cbc_inp_aligned: +.L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -359,61 +423,93 @@ _padlock_cbc_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L021cbc_out_aligned + jz .L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L021cbc_out_aligned: +.L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L019cbc_loop + jz .L023cbc_break + cmpl %ebx,%ecx + jae .L019cbc_loop +.L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.align 16 +.L023cbc_break: cmpl %ebp,%esp - je .L022cbc_done + je .L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L023cbc_bzero: +.L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L023cbc_bzero -.L022cbc_done: + ja .L025cbc_bzero +.L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L024cbc_exit -.align 16 -.L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L019cbc_loop + jmp .L026cbc_exit .align 16 .L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 
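# ----------------------------------------------------------------------
# Annotation (not part of the generated perlasm output): the
# .byte 243,15,167,208 just above is the raw encoding of VIA PadLock's
# "rep xcryptcbc", spelled out in bytes so the file assembles even with
# toolchains that lack PadLock mnemonics. Its register convention is
# implicit: %esi = source, %edi = destination, %ecx = block count
# (hence the preceding "shrl $4,%ecx"), %edx = control word, %ebx =
# round keys at 16(%edx), %eax = IV slot at -16(%edx). With a
# PadLock-aware assembler the same step could be written, as a sketch,
# simply as "rep xcryptcbc"; the movaps pair around this comment then
# re-reads the IV slot and stores it back to -16(%edx). (Compare the
# ECB paths, which use encoding 243,15,167,200 for "rep xcryptecb" and
# need no IV refresh.)
# ----------------------------------------------------------------------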
movaps %xmm0,-16(%edx) -.L024cbc_exit: + testl %ebp,%ebp + jz .L026cbc_exit +.L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -.L015cbc_abort: +.L016cbc_abort: popl %edi popl %esi popl %ebx @@ -433,25 +529,25 @@ _padlock_cfb_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L026cfb_abort + jnz .L028cfb_abort testl $15,%ecx - jnz .L026cfb_abort + jnz .L028cfb_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L027cfb_pic_point: +.L029cfb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx testl $32,(%edx) - jnz .L028cfb_aligned + jnz .L030cfb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L028cfb_aligned + jnz .L030cfb_aligned negl %eax movl $512,%ebx notl %eax @@ -463,10 +559,15 @@ _padlock_cfb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L029cfb_loop + movl %eax,16(%ebp) + jmp .L031cfb_loop .align 16 -.L029cfb_loop: +.L031cfb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -475,13 +576,13 @@ _padlock_cfb_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L030cfb_inp_aligned + jz .L032cfb_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L030cfb_inp_aligned: +.L032cfb_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -491,61 +592,45 @@ _padlock_cfb_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L031cfb_out_aligned + jz .L033cfb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L031cfb_out_aligned: +.L033cfb_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L029cfb_loop + jnz .L031cfb_loop cmpl %ebp,%esp - je .L032cfb_done + je .L034cfb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L033cfb_bzero: +.L035cfb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L033cfb_bzero -.L032cfb_done: + ja .L035cfb_bzero +.L034cfb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L034cfb_exit + jmp .L036cfb_exit .align 16 -.L035cfb_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L036cfb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L036cfb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L029cfb_loop -.align 16 -.L028cfb_aligned: +.L030cfb_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,224 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -.L034cfb_exit: +.L036cfb_exit: movl $1,%eax leal 4(%esp),%esp -.L026cfb_abort: +.L028cfb_abort: popl %edi popl %esi popl %ebx @@ -595,7 +680,12 @@ _padlock_ofb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) jmp .L040ofb_loop .align 16 .L040ofb_loop: @@ -625,8 +715,8 @@ _padlock_ofb_encrypt: testl $15,%edi jz .L042ofb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L042ofb_out_aligned: @@ -647,26 
+737,10 @@ _padlock_ofb_encrypt: cmpl %eax,%ebp ja .L044ofb_bzero .L043ofb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp jmp .L045ofb_exit .align 16 -.L046ofb_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L047ofb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L047ofb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L040ofb_loop -.align 16 .L039ofb_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx @@ -697,14 +771,14 @@ _padlock_ctr32_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L048ctr32_abort + jnz .L046ctr32_abort testl $15,%ecx - jnz .L048ctr32_abort + jnz .L046ctr32_abort leal .Lpadlock_saved_context,%eax pushfl cld call __padlock_verify_ctx -.L049ctr32_pic_point: +.L047ctr32_pic_point: leal 16(%edx),%edx xorl %eax,%eax movq -16(%edx),%mm0 @@ -718,10 +792,15 @@ _padlock_ctr32_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L050ctr32_loop + movl %eax,16(%ebp) + jmp .L048ctr32_loop .align 16 -.L050ctr32_loop: +.L048ctr32_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -730,7 +809,7 @@ _padlock_ctr32_encrypt: movl -4(%edx),%ecx xorl %edi,%edi movl -8(%edx),%eax -.L051ctr32_prepare: +.L049ctr32_prepare: movl %ecx,12(%esp,%edi,1) bswap %ecx movq %mm0,(%esp,%edi,1) @@ -739,7 +818,7 @@ _padlock_ctr32_encrypt: bswap %ecx leal 16(%edi),%edi cmpl %ebx,%edi - jb .L051ctr32_prepare + jb .L049ctr32_prepare movl %ecx,-4(%edx) leal (%esp),%esi leal (%esp),%edi @@ -752,32 +831,33 @@ _padlock_ctr32_encrypt: movl 12(%ebp),%ebx movl 4(%ebp),%esi xorl %ecx,%ecx -.L052ctr32_xor: +.L050ctr32_xor: movups (%esi,%ecx,1),%xmm1 leal 16(%ecx),%ecx pxor -16(%esp,%ecx,1),%xmm1 movups %xmm1,-16(%edi,%ecx,1) cmpl %ebx,%ecx - jb .L052ctr32_xor + jb .L050ctr32_xor movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L050ctr32_loop + jnz .L048ctr32_loop pxor %xmm0,%xmm0 leal (%esp),%eax -.L053ctr32_bzero: +.L051ctr32_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L053ctr32_bzero -.L054ctr32_done: + ja .L051ctr32_bzero +.L052ctr32_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp movl $1,%eax leal 4(%esp),%esp emms -.L048ctr32_abort: +.L046ctr32_abort: popl %edi popl %esi popl %ebx @@ -801,10 +881,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L055ret + jne .L053ret addl $4,184(%ecx) movl $0,%eax -.L055ret: +.L053ret: ret .globl _padlock_sha1_oneshot .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/coff/e_padlock-x86_64.s b/lib/accelerated/x86/coff/e_padlock-x86_64.s index b9b93cc592..cfa91d3fff 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86_64.s +++ b/lib/accelerated/x86/coff/e_padlock-x86_64.s @@ -135,7 +135,7 @@ padlock_aes_block: movq $1,%rcx leaq 32(%rdx),%rbx leaq 16(%rdx),%rdx -.byte 0xf3,0x0f,0xa7,0xc8 +.byte 0xf3,0x0f,0xa7,0xc8 movq %r8,%rbx movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -154,7 +154,7 @@ padlock_xstore: movq %rdx,%rsi movl %esi,%edx -.byte 0x0f,0xa7,0xc0 +.byte 0x0f,0xa7,0xc0 movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -181,7 +181,7 @@ padlock_sha1_oneshot: movq %rsp,%rdi movl %eax,16(%rsp) xorq %rax,%rax -.byte 0xf3,0x0f,0xa6,0xc8 +.byte 0xf3,0x0f,0xa6,0xc8 movaps (%rsp),%xmm0 movl 16(%rsp),%eax addq $128+8,%rsp @@ -213,7 +213,7 @@ padlock_sha1_blocks: movq %rsp,%rdi 
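# ----------------------------------------------------------------------
# Annotation (not part of the generated perlasm output): the PadLock
# hash engine, like xcrypt above, is driven through raw opcodes --
# 0xf3,0x0f,0xa6,0xc8 decodes as "rep xsha1" (and 0xf3,0x0f,0xa6,0xd0
# as "rep xsha256"), with %rdi pointing at the aligned state scratch
# set up just above, %rsi at the input, and %rcx holding the byte
# count. The mode selector travels in %rax: the oneshot entry points
# clear it, apparently so the hardware appends the final padding
# itself, while these *_blocks variants load $-1 to hash whole blocks
# and leave the running digest unfinalized. This reading is inferred
# from the surrounding code rather than taken from vendor text.
# ----------------------------------------------------------------------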
movl %eax,16(%rsp) movq $-1,%rax -.byte 0xf3,0x0f,0xa6,0xc8 +.byte 0xf3,0x0f,0xa6,0xc8 movaps (%rsp),%xmm0 movl 16(%rsp),%eax addq $128+8,%rsp @@ -245,7 +245,7 @@ padlock_sha256_oneshot: movq %rsp,%rdi movaps %xmm1,16(%rsp) xorq %rax,%rax -.byte 0xf3,0x0f,0xa6,0xd0 +.byte 0xf3,0x0f,0xa6,0xd0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 addq $128+8,%rsp @@ -277,7 +277,7 @@ padlock_sha256_blocks: movq %rsp,%rdi movaps %xmm1,16(%rsp) movq $-1,%rax -.byte 0xf3,0x0f,0xa6,0xd0 +.byte 0xf3,0x0f,0xa6,0xd0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 addq $128+8,%rsp @@ -312,7 +312,7 @@ padlock_sha512_blocks: movaps %xmm1,16(%rsp) movaps %xmm2,32(%rsp) movaps %xmm3,48(%rsp) -.byte 0xf3,0x0f,0xa6,0xe0 +.byte 0xf3,0x0f,0xa6,0xe0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 movaps 32(%rsp),%xmm2 @@ -354,8 +354,6 @@ padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe .Lecb_short testl $32,(%rdx) jnz .Lecb_aligned testq $15,%rdi @@ -375,6 +373,21 @@ padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lecb_unaligned_tail jmp .Lecb_loop .p2align 4 .Lecb_loop: @@ -390,7 +403,7 @@ padlock_ecb_encrypt: testq $15,%rsi jz .Lecb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -398,15 +411,15 @@ padlock_ecb_encrypt: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,200 +.byte 0xf3,0x0f,0xa7,200 movq %r8,%rdi movq %r11,%rbx testq $15,%rdi jz .Lecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lecb_out_aligned: movq %r9,%rsi @@ -415,9 +428,26 @@ padlock_ecb_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lecb_loop - + jz .Lecb_break + cmpq %rbx,%rcx + jae .Lecb_loop +.Lecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lecb_loop +.p2align 4 +.Lecb_break: + cmpq %rbp,%rsp je .Lecb_done pxor %xmm0,%xmm0 @@ -431,26 +461,39 @@ padlock_ecb_encrypt: .Lecb_done: leaq (%rbp),%rsp jmp .Lecb_exit -.p2align 4 -.Lecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lecb_loop + .p2align 4 .Lecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,200 +.byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz .Lecb_exit + +.Lecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lecb_loop .Lecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -489,8 +532,6 @@ padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lcbc_short testl $32,(%rdx) jnz .Lcbc_aligned testq $15,%rdi @@ -510,6 
+551,21 @@ padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja .Lcbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lcbc_unaligned_tail jmp .Lcbc_loop .p2align 4 .Lcbc_loop: @@ -525,7 +581,7 @@ padlock_cbc_encrypt: testq $15,%rsi jz .Lcbc_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -533,7 +589,7 @@ padlock_cbc_encrypt: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,208 +.byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -541,9 +597,9 @@ padlock_cbc_encrypt: testq $15,%rdi jz .Lcbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcbc_out_aligned: movq %r9,%rsi @@ -552,9 +608,26 @@ padlock_cbc_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz .Lcbc_loop - + jz .Lcbc_break + cmpq %rbx,%rcx + jae .Lcbc_loop +.Lcbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lcbc_loop +.p2align 4 +.Lcbc_break: + cmpq %rbp,%rsp je .Lcbc_done pxor %xmm0,%xmm0 @@ -568,28 +641,41 @@ padlock_cbc_encrypt: .Lcbc_done: leaq (%rbp),%rsp jmp .Lcbc_exit -.p2align 4 -.Lcbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lcbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lcbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lcbc_loop + .p2align 4 .Lcbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lcbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,208 +.byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz .Lcbc_exit + +.Lcbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lcbc_loop .Lcbc_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -647,6 +733,8 @@ padlock_cfb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx jmp .Lcfb_loop .p2align 4 .Lcfb_loop: @@ -662,7 +750,7 @@ padlock_cfb_encrypt: testq $15,%rsi jz .Lcfb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -670,7 +758,7 @@ padlock_cfb_encrypt: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,224 +.byte 0xf3,0x0f,0xa7,224 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -678,9 +766,9 @@ padlock_cfb_encrypt: testq $15,%rdi jz .Lcfb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lcfb_out_aligned: movq %r9,%rsi @@ -690,8 +778,7 @@ padlock_cfb_encrypt: subq %rbx,%rcx movq $512,%rbx jnz .Lcfb_loop - - cmpq %rsp,%rbp + cmpq %rbp,%rsp je .Lcfb_done pxor %xmm0,%xmm0 @@ -705,12 +792,13 @@ padlock_cfb_encrypt: .Lcfb_done: leaq (%rbp),%rsp jmp .Lcfb_exit + .p2align 4 .Lcfb_aligned: leaq 
-16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,224 +.byte 0xf3,0x0f,0xa7,224 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) .Lcfb_exit: @@ -770,6 +858,8 @@ padlock_ofb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx jmp .Lofb_loop .p2align 4 .Lofb_loop: @@ -785,7 +875,7 @@ padlock_ofb_encrypt: testq $15,%rsi jz .Lofb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -793,7 +883,7 @@ padlock_ofb_encrypt: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,232 +.byte 0xf3,0x0f,0xa7,232 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -801,9 +891,9 @@ padlock_ofb_encrypt: testq $15,%rdi jz .Lofb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lofb_out_aligned: movq %r9,%rsi @@ -813,8 +903,7 @@ padlock_ofb_encrypt: subq %rbx,%rcx movq $512,%rbx jnz .Lofb_loop - - cmpq %rsp,%rbp + cmpq %rbp,%rsp je .Lofb_done pxor %xmm0,%xmm0 @@ -828,12 +917,13 @@ padlock_ofb_encrypt: .Lofb_done: leaq (%rbp),%rsp jmp .Lofb_exit + .p2align 4 .Lofb_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,232 +.byte 0xf3,0x0f,0xa7,232 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) .Lofb_exit: @@ -874,8 +964,6 @@ padlock_ctr32_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe .Lctr32_short testl $32,(%rdx) jnz .Lctr32_aligned testq $15,%rdi @@ -895,15 +983,32 @@ padlock_ctr32_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx .Lctr32_reenter: movl -4(%rdx),%eax bswapl %eax negl %eax andl $31,%eax - jz .Lctr32_loop + movq $512,%rbx shll $4,%eax + cmovzq %rbx,%rax cmpq %rax,%rcx cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + cmpq %rbx,%rcx + ja .Lctr32_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lctr32_unaligned_tail jmp .Lctr32_loop .p2align 4 .Lctr32_loop: @@ -919,7 +1024,7 @@ padlock_ctr32_encrypt: testq $15,%rsi jz .Lctr32_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -927,23 +1032,23 @@ padlock_ctr32_encrypt: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,216 +.byte 0xf3,0x0f,0xa7,216 movl -4(%rdx),%eax testl $4294901760,%eax - jnz .Lctr32_no_corr + jnz .Lctr32_no_carry bswapl %eax addl $65536,%eax bswapl %eax movl %eax,-4(%rdx) -.Lctr32_no_corr: +.Lctr32_no_carry: movq %r8,%rdi movq %r11,%rbx testq $15,%rdi jz .Lctr32_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi .Lctr32_out_aligned: movq %r9,%rsi @@ -952,9 +1057,38 @@ padlock_ctr32_encrypt: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx + jz .Lctr32_break + cmpq %rbx,%rcx + jae .Lctr32_loop + movq %rcx,%rbx + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx jnz .Lctr32_loop - +.Lctr32_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lctr32_loop +.p2align 4 +.Lctr32_break: + cmpq %rbp,%rsp je .Lctr32_done pxor 
%xmm0,%xmm0 @@ -968,56 +1102,75 @@ padlock_ctr32_encrypt: .Lctr32_done: leaq (%rbp),%rsp jmp .Lctr32_exit -.p2align 4 -.Lctr32_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -.Lctr32_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja .Lctr32_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp .Lctr32_reenter + .p2align 4 .Lctr32_aligned: movl -4(%rdx),%eax - movq $1048576,%rbx bswapl %eax - cmpq %rcx,%rbx - cmovaq %rcx,%rbx negl %eax andl $65535,%eax - jz .Lctr32_aligned_loop + movq $1048576,%rbx shll $4,%eax + cmovzq %rbx,%rax cmpq %rax,%rcx cmovaq %rax,%rbx - jmp .Lctr32_aligned_loop -.p2align 4 + cmovbeq %rcx,%rbx + jbe .Lctr32_aligned_skip + .Lctr32_aligned_loop: - cmpq %rcx,%rbx - cmovaq %rcx,%rbx movq %rcx,%r10 movq %rbx,%rcx movq %rbx,%r11 + leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,216 +.byte 0xf3,0x0f,0xa7,216 + movl -4(%rdx),%eax bswapl %eax addl $65536,%eax bswapl %eax movl %eax,-4(%rdx) - movq %r11,%rbx movq %r10,%rcx - subq %rbx,%rcx + subq %r11,%rcx movq $1048576,%rbx - jnz .Lctr32_aligned_loop + jz .Lctr32_exit + cmpq %rbx,%rcx + jae .Lctr32_aligned_loop + +.Lctr32_aligned_skip: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $32,%rbp + movq $32-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lctr32_aligned_tail + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + testq %rbp,%rbp + jz .Lctr32_exit + +.Lctr32_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lctr32_loop .Lctr32_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index 7a89b7a599..b1d69911c7 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -39,6 +39,7 @@ # .text + .globl gcm_gmult_4bit .def gcm_gmult_4bit; .scl 2; .type 32; .endef .p2align 4 @@ -717,6 +718,11 @@ gcm_ghash_4bit: .def gcm_init_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_init_clmul: +.L_init_clmul: +.LSEH_begin_gcm_init_clmul: + +.byte 0x48,0x83,0xec,0x18 +.byte 0x0f,0x29,0x34,0x24 movdqu (%rdx),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -735,15 +741,15 @@ gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -753,44 +759,137 @@ gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rcx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rcx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rcx) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor 
%xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rcx) - movdqu %xmm0,16(%rcx) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rcx) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rcx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rcx) + movaps (%rsp),%xmm6 + leaq 24(%rsp),%rsp +.LSEH_end_gcm_init_clmul: .byte 0xf3,0xc3 .globl gcm_gmult_clmul .def gcm_gmult_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_gmult_clmul: +.L_gmult_clmul: movdqu (%rcx),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rdx),%xmm2 + movdqu 32(%rdx),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -803,209 +902,395 @@ gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rcx) .byte 0xf3,0xc3 .globl gcm_ghash_clmul .def gcm_ghash_clmul; .scl 2; .type 32; .endef -.p2align 4 +.p2align 5 gcm_ghash_clmul: +.L_ghash_clmul: + leaq -136(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: -.byte 0x48,0x83,0xec,0x58 -.byte 0x0f,0x29,0x34,0x24 -.byte 0x0f,0x29,0x7c,0x24,0x10 -.byte 0x44,0x0f,0x29,0x44,0x24,0x20 -.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 -.byte 0x44,0x0f,0x29,0x54,0x24,0x40 - movdqa .Lbswap_mask(%rip),%xmm5 +.byte 0x48,0x8d,0x60,0xe0 +.byte 0x0f,0x29,0x70,0xe0 +.byte 0x0f,0x29,0x78,0xf0 +.byte 0x44,0x0f,0x29,0x00 +.byte 0x44,0x0f,0x29,0x48,0x10 +.byte 0x44,0x0f,0x29,0x50,0x20 +.byte 0x44,0x0f,0x29,0x58,0x30 +.byte 0x44,0x0f,0x29,0x60,0x40 +.byte 0x44,0x0f,0x29,0x68,0x50 +.byte 0x44,0x0f,0x29,0x70,0x60 +.byte 0x44,0x0f,0x29,0x78,0x70 + movdqa .Lbswap_mask(%rip),%xmm10 movdqu (%rcx),%xmm0 movdqu (%rdx),%xmm2 -.byte 102,15,56,0,197 + movdqu 32(%rdx),%xmm7 +.byte 102,65,15,56,0,194 subq $16,%r9 jz 
.Lodd_tail - movdqu 16(%rdx),%xmm8 - + movdqu 16(%rdx),%xmm6 + movl _gnutls_x86_cpuid_s+4(%rip),%eax + cmpq $48,%r9 + jb .Lskip4x + + andl $71303168,%eax + cmpl $4194304,%eax + je .Lskip4x + + subq $48,%r9 + movq $11547335547999543296,%rax + movdqu 48(%rdx),%xmm14 + movdqu 64(%rdx),%xmm15 + + + + + movdqu 48(%r8),%xmm3 + movdqu 32(%r8),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 + xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rdx),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%r8),%xmm11 + movdqu 0(%r8),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%r8),%r8 + subq $64,%r9 + jc .Ltail4x + + jmp .Lmod4_loop +.p2align 5 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%r8),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%r8),%xmm3 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 +.byte 102,68,15,58,68,199,16 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rdx),%xmm7 +.byte 102,68,15,58,68,218,0 + xorps %xmm4,%xmm8 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + pxor %xmm3,%xmm4 + pxor %xmm1,%xmm8 + movdqa %xmm8,%xmm9 + pslldq $8,%xmm8 +.byte 102,68,15,58,68,234,17 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 +.byte 102,68,15,58,68,231,0 + pxor %xmm0,%xmm9 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%r8),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%r8),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rdx),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + + leaq 64(%r8),%r8 + subq $64,%r9 + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 +.byte 102,68,15,58,68,199,16 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 - movdqu (%r8),%xmm3 - movdqu 16(%r8),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor 
%xmm2,%xmm4 -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 - - movdqa %xmm3,%xmm4 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%r9 + jz .Ldone + movdqu 32(%rdx),%xmm7 + subq $16,%r9 + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%r8),%xmm8 + movdqu 16(%r8),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 leaq 32(%r8),%r8 + nop subq $32,%r9 jbe .Leven_tail + nop + jmp .Lmod_loop +.p2align 5 .Lmod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%r8),%xmm5 + pxor %xmm0,%xmm8 +.byte 102,65,15,56,0,234 + movdqu 16(%r8),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm5,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 pslldq $8,%xmm4 - pxor %xmm3,%xmm1 + pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 - movdqu (%r8),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 - - movdqu 16(%r8),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 psllq $5,%xmm0 - pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm8 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 - -.byte 102,15,58,68,250,17 - movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + +.byte 102,15,58,68,234,17 + movdqa %xmm0,%xmm9 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%r8),%r8 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - -.byte 102,69,15,58,68,202,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 +.byte 0x66,0x90 - leaq 32(%r8),%r8 subq $32,%r9 ja .Lmod_loop .Leven_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + 
pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 pslldq $8,%xmm4 - pxor %xmm3,%xmm1 + pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %r9,%r9 jnz .Ldone .Lodd_tail: - movdqu (%r8),%xmm3 -.byte 102,15,56,0,221 - pxor %xmm3,%xmm0 + movdqu (%r8),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -1015,44 +1300,72 @@ gcm_ghash_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .Ldone: -.byte 102,15,56,0,197 +.byte 102,65,15,56,0,194 movdqu %xmm0,(%rcx) movaps (%rsp),%xmm6 movaps 16(%rsp),%xmm7 movaps 32(%rsp),%xmm8 movaps 48(%rsp),%xmm9 movaps 64(%rsp),%xmm10 - addq $88,%rsp - .byte 0xf3,0xc3 + movaps 80(%rsp),%xmm11 + movaps 96(%rsp),%xmm12 + movaps 112(%rsp),%xmm13 + movaps 128(%rsp),%xmm14 + movaps 144(%rsp),%xmm15 + leaq 168(%rsp),%rsp .LSEH_end_gcm_ghash_clmul: + .byte 0xf3,0xc3 + +.globl gcm_init_avx +.def gcm_init_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_init_avx: + jmp .L_init_clmul + +.globl gcm_gmult_avx +.def gcm_gmult_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_gmult_avx: + jmp .L_gmult_clmul + +.globl gcm_ghash_avx +.def gcm_ghash_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_ghash_avx: + jmp .L_ghash_clmul .p2align 6 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 .p2align 6 .Lrem_4bit: @@ -1149,7 +1462,7 @@ se_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -1189,26 +1502,38 @@ se_handler: .rva .LSEH_end_gcm_ghash_4bit .rva .LSEH_info_gcm_ghash_4bit +.rva .LSEH_begin_gcm_init_clmul +.rva .LSEH_end_gcm_init_clmul +.rva .LSEH_info_gcm_init_clmul + .rva .LSEH_begin_gcm_ghash_clmul .rva .LSEH_end_gcm_ghash_clmul .rva .LSEH_info_gcm_ghash_clmul - .section .xdata .p2align 3 .LSEH_info_gcm_gmult_4bit: .byte 9,0,0,0 .rva se_handler -.rva .Lgmult_prologue,.Lgmult_epilogue +.rva .Lgmult_prologue,.Lgmult_epilogue .LSEH_info_gcm_ghash_4bit: .byte 9,0,0,0 .rva se_handler -.rva .Lghash_prologue,.Lghash_epilogue +.rva .Lghash_prologue,.Lghash_epilogue +.LSEH_info_gcm_init_clmul: +.byte 0x01,0x08,0x03,0x00 +.byte 0x08,0x68,0x00,0x00 
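# ----------------------------------------------------------------------
# Annotation (not part of the generated perlasm output): these .byte
# runs are hand-assembled Win64 UNWIND_INFO records. For
# .LSEH_info_gcm_init_clmul the header 0x01,0x08,0x03,0x00 reads as
# version 1 / no flags, prolog size 8, three unwind-code slots (padded
# to an even four), no frame register. The slot above this comment,
# 0x08,0x68 with scaled offset 0x0000, is UWOP_SAVE_XMM128 of %xmm6 at
# [rsp+0] once prolog offset 8 is reached; the slot that follows,
# 0x04,0x22, is UWOP_ALLOC_SMALL of (2*8)+8 = 24 bytes at prolog
# offset 4 -- matching the "subq $24,%rsp; movaps %xmm6,(%rsp)" byte
# sequence (0x48,0x83,0xec,0x18 / 0x0f,0x29,0x34,0x24) emitted at
# .LSEH_begin_gcm_init_clmul earlier in this file.
# ----------------------------------------------------------------------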
+.byte 0x04,0x22,0x00,0x00 .LSEH_info_gcm_ghash_clmul: -.byte 0x01,0x1f,0x0b,0x00 -.byte 0x1f,0xa8,0x04,0x00 -.byte 0x19,0x98,0x03,0x00 -.byte 0x13,0x88,0x02,0x00 -.byte 0x0d,0x78,0x01,0x00 -.byte 0x08,0x68,0x00,0x00 -.byte 0x04,0xa2,0x00,0x00 +.byte 0x01,0x33,0x16,0x00 +.byte 0x33,0xf8,0x09,0x00 +.byte 0x2e,0xe8,0x08,0x00 +.byte 0x29,0xd8,0x07,0x00 +.byte 0x24,0xc8,0x06,0x00 +.byte 0x1f,0xb8,0x05,0x00 +.byte 0x1a,0xa8,0x04,0x00 +.byte 0x15,0x98,0x03,0x00 +.byte 0x10,0x88,0x02,0x00 +.byte 0x0c,0x78,0x01,0x00 +.byte 0x08,0x68,0x00,0x00 +.byte 0x04,0x01,0x15,0x00 diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index 9aa029f1bb..e8136025e8 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -54,23 +54,25 @@ sha1_block_data_order: movl _gnutls_x86_cpuid_s+0(%rip),%r9d movl _gnutls_x86_cpuid_s+4(%rip),%r8d + movl _gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz .Lialu jmp _ssse3_shortcut .p2align 4 .Lialu: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 - movq %rsp,%r11 + pushq %r14 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) .Lprologue: movl 0(%r8),%esi @@ -84,1230 +86,1168 @@ sha1_block_data_order: .Lloop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl 
%r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + 
xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 
36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 
12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + 
movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 
48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl 
%r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl 
%r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 
20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl 
%edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1327,11 +1267,12 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -1349,23 +1290,29 @@ sha1_block_data_order_ssse3: movq %r8,%rdx _ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 - leaq -144(%rsp),%rsp - movaps %xmm6,64+0(%rsp) - movaps %xmm7,64+16(%rsp) - movaps %xmm8,64+32(%rsp) - movaps %xmm9,64+48(%rsp) - movaps %xmm10,64+64(%rsp) + pushq %r13 + pushq %r14 + leaq -160(%rsp),%rsp + movaps %xmm6,-40-96(%rax) + movaps %xmm7,-40-80(%rax) + movaps %xmm8,-40-64(%rax) + movaps %xmm9,-40-48(%rax) + movaps %xmm10,-40-32(%rax) + movaps %xmm11,-40-16(%rax) .Lprologue_ssse3: + movq %rax,%r14 + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r11 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1373,19 +1320,22 @@ _ssse3_shortcut: movl 12(%r8),%edx movl %ebx,%esi movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1397,904 +1347,882 @@ _ssse3_shortcut: jmp .Loop_ssse3 .p2align 4 .Loop_ssse3: - movdqa %xmm1,%xmm4 - addl 0(%rsp),%ebp - xorl %edx,%ecx + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx roll $5,%eax - paddd %xmm3,%xmm9 - andl %ecx,%esi - xorl %edx,%ecx + addl %esi,%ebp psrldq $4,%xmm8 - xorl %edx,%esi - addl %eax,%ebp + andl %ebx,%edi + xorl %ecx,%ebx pxor %xmm0,%xmm4 - rorl $2,%ebx - addl %esi,%ebp + addl %eax,%ebp + rorl $7,%eax 
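# [annotation, not part of the generated diff] Two patterns dominate the
# scalar SHA-1 hunks above. The decimal constants folded into leal
# (1518500249, 1859775393, -1894007588, -899497514) are the round
# constants 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6; each
# leal K(%rX,%rY,1) folds the e + W[t] + K additions into a single
# instruction, and the xorl k(%rsp) / roll $1 chains implement the
# message schedule W[t] = rol1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])
# over a 16-word circular buffer at (%rsp). In the SSSE3 prologue that
# follows, the rewritten code keeps the incoming %rsp in %rax (then
# %r14), aligns the frame with andq $-64,%rsp, and spills xmm6-xmm11,
# which the Win64 ABI treats as callee-saved; the epilogue and the SEH
# handler restore them from the same negative offsets, and the constant
# pool is now addressed as K_XX_XX+64 so the four K rows sit at
# offsets -64, -32, 0 and 32 of %r11.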
pxor %xmm2,%xmm8 - addl 4(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 4(%rsp),%edx pxor %xmm8,%xmm4 - andl %ebx,%edi - xorl %ecx,%ebx + xorl %ebx,%eax + roll $5,%ebp movdqa %xmm9,48(%rsp) - xorl %ecx,%edi - addl %ebp,%edx - movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 - rorl $7,%eax addl %edi,%edx - addl 8(%rsp),%ecx + andl %eax,%esi + movdqa %xmm4,%xmm10 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi - roll $5,%edx - andl %eax,%esi - xorl %ebx,%eax + addl 8(%rsp),%ecx psrld $31,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - movdqa %xmm10,%xmm9 - rorl $7,%ebp + xorl %eax,%ebp + roll $5,%edx addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx por %xmm8,%xmm4 - addl 12(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 - andl %ebp,%edi - xorl %eax,%ebp - movdqa 0(%r11),%xmm10 - xorl %eax,%edi - addl %ecx,%ebx - pxor %xmm9,%xmm4 - rorl $7,%edx + xorl %ebp,%edx + movdqa -64(%r11),%xmm10 + roll $5,%ecx addl %edi,%ebx - movdqa %xmm2,%xmm5 - addl 16(%rsp),%eax + andl %edx,%esi + pxor %xmm9,%xmm4 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx roll $5,%ebx - paddd %xmm4,%xmm10 - andl %edx,%esi - xorl %ebp,%edx + addl %esi,%eax psrldq $4,%xmm9 - xorl %ebp,%esi - addl %ebx,%eax + andl %ecx,%edi + xorl %edx,%ecx pxor %xmm1,%xmm5 - rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax + rorl $7,%ebx pxor %xmm3,%xmm9 - addl 20(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 20(%rsp),%ebp pxor %xmm9,%xmm5 - andl %ecx,%edi - xorl %edx,%ecx + xorl %ecx,%ebx + roll $5,%eax movdqa %xmm10,0(%rsp) - xorl %edx,%edi - addl %eax,%ebp - movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 - rorl $7,%ebx addl %edi,%ebp - addl 24(%rsp),%edx + andl %ebx,%esi + movdqa %xmm5,%xmm8 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi - roll $5,%ebp - andl %ebx,%esi - xorl %ecx,%ebx + addl 24(%rsp),%edx psrld $31,%xmm9 - xorl %ecx,%esi - addl %ebp,%edx - movdqa %xmm8,%xmm10 - rorl $7,%eax + xorl %ebx,%eax + roll $5,%ebp addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax psrld $30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp por %xmm9,%xmm5 - addl 28(%rsp),%ecx - xorl %ebx,%eax + xorl %ebx,%edi movl %edx,%esi - roll $5,%edx + addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 - andl %eax,%edi - xorl %ebx,%eax - movdqa 16(%r11),%xmm8 - xorl %ebx,%edi - addl %edx,%ecx - pxor %xmm10,%xmm5 - rorl $7,%ebp + xorl %eax,%ebp + movdqa -32(%r11),%xmm8 + roll $5,%edx addl %edi,%ecx - movdqa %xmm3,%xmm6 - addl 32(%rsp),%ebx + andl %ebp,%esi + pxor %xmm10,%xmm5 xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx roll $5,%ecx - paddd %xmm5,%xmm8 - andl %ebp,%esi - xorl %eax,%ebp + addl %esi,%ebx psrldq $4,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx + andl %edx,%edi + xorl %ebp,%edx pxor %xmm2,%xmm6 - rorl $7,%edx - addl %esi,%ebx + addl %ecx,%ebx + rorl $7,%ecx pxor %xmm4,%xmm10 - addl 36(%rsp),%eax - xorl 
%ebp,%edx + xorl %ebp,%edi movl %ebx,%esi - roll $5,%ebx + addl 36(%rsp),%eax pxor %xmm10,%xmm6 - andl %edx,%edi - xorl %ebp,%edx + xorl %edx,%ecx + roll $5,%ebx movdqa %xmm8,16(%rsp) - xorl %ebp,%edi - addl %ebx,%eax - movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 - rorl $7,%ecx addl %edi,%eax - addl 40(%rsp),%ebp + andl %ecx,%esi + movdqa %xmm6,%xmm9 xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi - roll $5,%eax - andl %ecx,%esi - xorl %edx,%ecx + addl 40(%rsp),%ebp psrld $31,%xmm10 - xorl %edx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm8 - rorl $7,%ebx + xorl %ecx,%ebx + roll $5,%eax addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax por %xmm10,%xmm6 - addl 44(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 - andl %ebx,%edi - xorl %ecx,%ebx - movdqa 16(%r11),%xmm9 - xorl %ecx,%edi - addl %ebp,%edx - pxor %xmm8,%xmm6 - rorl $7,%eax + xorl %ebx,%eax + movdqa -32(%r11),%xmm9 + roll $5,%ebp addl %edi,%edx - movdqa %xmm4,%xmm7 - addl 48(%rsp),%ecx + andl %eax,%esi + pxor %xmm8,%xmm6 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp roll $5,%edx - paddd %xmm6,%xmm9 - andl %eax,%esi - xorl %ebx,%eax + addl %esi,%ecx psrldq $4,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx + andl %ebp,%edi + xorl %eax,%ebp pxor %xmm3,%xmm7 - rorl $7,%ebp - addl %esi,%ecx + addl %edx,%ecx + rorl $7,%edx pxor %xmm5,%xmm8 - addl 52(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 52(%rsp),%ebx pxor %xmm8,%xmm7 - andl %ebp,%edi - xorl %eax,%ebp + xorl %ebp,%edx + roll $5,%ecx movdqa %xmm9,32(%rsp) - xorl %eax,%edi - addl %ecx,%ebx - movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 - rorl $7,%edx addl %edi,%ebx - addl 56(%rsp),%eax + andl %edx,%esi + movdqa %xmm7,%xmm10 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi - roll $5,%ebx - andl %edx,%esi - xorl %ebp,%edx + addl 56(%rsp),%eax psrld $31,%xmm8 - xorl %ebp,%esi - addl %ebx,%eax - movdqa %xmm10,%xmm9 - rorl $7,%ecx + xorl %edx,%ecx + roll $5,%ebx addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx por %xmm8,%xmm7 - addl 60(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 - andl %ecx,%edi - xorl %edx,%ecx - movdqa 16(%r11),%xmm10 - xorl %edx,%edi - addl %eax,%ebp - pxor %xmm9,%xmm7 - rorl $7,%ebx + xorl %ecx,%ebx + movdqa -32(%r11),%xmm10 + roll $5,%eax addl %edi,%ebp - movdqa %xmm7,%xmm9 - addl 0(%rsp),%edx - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax roll $5,%ebp pxor %xmm1,%xmm0 - andl %ebx,%esi - xorl %ecx,%ebx + addl %esi,%edx + andl %eax,%edi movdqa %xmm10,%xmm8 + xorl %ebx,%eax paddd %xmm7,%xmm10 - xorl %ecx,%esi addl %ebp,%edx pxor %xmm9,%xmm0 - rorl $7,%eax - addl %esi,%edx + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi addl 4(%rsp),%ecx - xorl %ebx,%eax movdqa %xmm0,%xmm9 - 
movdqa %xmm10,48(%rsp) - movl %edx,%esi + xorl %eax,%ebp roll $5,%edx - andl %eax,%edi - xorl %ebx,%eax + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp pslld $2,%xmm0 - xorl %ebx,%edi addl %edx,%ecx + rorl $7,%edx psrld $30,%xmm9 - rorl $7,%ebp - addl %edi,%ecx - addl 8(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%esi movl %ecx,%edi - roll $5,%ecx + addl 8(%rsp),%ebx por %xmm9,%xmm0 - andl %ebp,%esi - xorl %eax,%ebp - movdqa %xmm0,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - addl 12(%rsp),%eax xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax xorl %ebp,%edi - addl %ebx,%eax - rorl $7,%ecx + movl %ebx,%esi + roll $5,%ebx addl %edi,%eax - addl 16(%rsp),%ebp - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm0,%xmm8 + addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm8,0(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 + addl %ebp,%edx addl 24(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm10 + pslld $2,%xmm1 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx - xorl %eax,%edi - movdqa %xmm1,%xmm8 + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 32(%rsp),%eax - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 - xorl %edx,%esi - addl %ebx,%eax - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r11),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi movdqa %xmm9,16(%rsp) - xorl %ecx,%edi - addl %eax,%ebp rorl $7,%ebx - addl %edi,%ebp - pslld $2,%xmm2 + addl %eax,%ebp addl 40(%rsp),%edx - xorl %ecx,%esi - psrld $30,%xmm8 + pslld $2,%xmm2 + xorl %ebx,%esi movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx - xorl %ebx,%edi - movdqa %xmm2,%xmm9 + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 48(%rsp),%ebx - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 - xorl %ebp,%esi - addl %ecx,%ebx + addl %esi,%ebx + xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx - addl %esi,%ebx + paddd %xmm2,%xmm10 + addl 
%ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax - xorl %ebp,%edi + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi movdqa %xmm10,32(%rsp) - xorl %edx,%edi - addl %ebx,%eax rorl $7,%ecx - addl %edi,%eax - pslld $2,%xmm3 + addl %ebx,%eax addl 56(%rsp),%ebp - xorl %edx,%esi - psrld $30,%xmm9 + pslld $2,%xmm3 + xorl %ecx,%esi movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx - xorl %ecx,%edi - movdqa %xmm3,%xmm10 + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 0(%rsp),%ecx - pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 - xorl %eax,%esi - addl %edx,%ecx + addl %esi,%ecx + xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp - addl %esi,%ecx + paddd %xmm3,%xmm8 + addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi movdqa %xmm8,48(%rsp) - xorl %ebp,%edi - addl %ecx,%ebx rorl $7,%edx - addl %edi,%ebx - pslld $2,%xmm4 + addl %ecx,%ebx addl 8(%rsp),%eax - xorl %ebp,%esi - psrld $30,%xmm10 + pslld $2,%xmm4 + xorl %edx,%esi movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp - xorl %edx,%edi - movdqa %xmm4,%xmm8 + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 16(%rsp),%edx - pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 - xorl %ebx,%esi - addl %ebp,%edx + addl %esi,%edx + xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax - addl %esi,%edx + paddd %xmm4,%xmm9 + addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx - xorl %ebx,%edi + xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi movdqa %xmm9,0(%rsp) - xorl %eax,%edi - addl %edx,%ecx rorl $7,%ebp - addl %edi,%ecx - pslld $2,%xmm5 + addl %edx,%ecx addl 24(%rsp),%ebx - xorl %eax,%esi - psrld $30,%xmm8 + pslld $2,%xmm5 + xorl %ebp,%esi movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - xorl %ebp,%edi - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx movl %ebx,%esi - roll $5,%ebx xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + roll $5,%ebx addl %edi,%eax - movl %ecx,%edi - pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + xorl %ecx,%esi xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 addl 32(%rsp),%ebp - andl %edx,%edi - pxor %xmm7,%xmm6 andl %ecx,%esi + xorl %edx,%ecx rorl $7,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 - addl %edi,%ebp + punpcklqdq %xmm5,%xmm9 movl %eax,%edi - pxor %xmm9,%xmm6 + xorl %ecx,%esi + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) - movl %ebx,%esi + 
movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp addl 36(%rsp),%edx - andl %ecx,%esi - pslld $2,%xmm6 andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - psrld $30,%xmm9 - addl %esi,%edx + movdqa %xmm6,%xmm9 movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - por %xmm9,%xmm6 - movl %eax,%edi + xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax - movdqa %xmm6,%xmm10 + addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx - andl %ebx,%edi andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - addl %edi,%ecx movl %edx,%edi + xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%esi + xorl %ebp,%edi xorl %eax,%ebp + addl %edx,%ecx addl 44(%rsp),%ebx - andl %eax,%esi andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - addl %esi,%ebx movl %ecx,%esi + xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp + xorl %edx,%esi + xorl %ebp,%edx addl %ecx,%ebx - movl %edx,%edi pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 - xorl %ebp,%edx addl 48(%rsp),%eax - andl %ebp,%edi - pxor %xmm0,%xmm7 andl %edx,%esi + xorl %ebp,%edx rorl $7,%ecx - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 - addl %edi,%eax + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi - pxor %xmm10,%xmm7 + xorl %edx,%esi + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) - movl %ecx,%esi + movdqa 32(%r11),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax addl 52(%rsp),%ebp - andl %edx,%esi - pslld $2,%xmm7 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - psrld $30,%xmm10 - addl %esi,%ebp + movdqa %xmm7,%xmm10 movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - por %xmm10,%xmm7 - movl %ebx,%edi + xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx - movdqa %xmm7,%xmm8 + addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx - andl %ecx,%edi andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - addl %edi,%edx movl %ebp,%edi + xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%esi + xorl %eax,%edi xorl %ebx,%eax + addl %ebp,%edx addl 60(%rsp),%ecx - andl %ebx,%esi andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - addl %esi,%ecx movl %edx,%esi + xorl %eax,%edi roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax + xorl %ebp,%esi + xorl %eax,%ebp addl %edx,%ecx - movl %ebp,%edi pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 - xorl %eax,%ebp addl 0(%rsp),%ebx - andl %eax,%edi - pxor %xmm1,%xmm0 andl %ebp,%esi + xorl %eax,%ebp rorl $7,%edx - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 - addl %edi,%ebx + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi - pxor %xmm8,%xmm0 + xorl %ebp,%esi + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) - movl %edx,%esi + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx addl 4(%rsp),%eax - andl %ebp,%esi - pslld $2,%xmm0 andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - psrld $30,%xmm8 - addl %esi,%eax + movdqa %xmm0,%xmm8 movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx - addl %ebx,%eax - por %xmm8,%xmm0 - movl %ecx,%edi + xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx - movdqa %xmm0,%xmm9 + addl %ebx,%eax + psrld $30,%xmm8 addl 
8(%rsp),%ebp - andl %edx,%edi andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - addl %edi,%ebp movl %eax,%edi + xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%esi + xorl %ebx,%edi xorl %ecx,%ebx + addl %eax,%ebp addl 12(%rsp),%edx - andl %ecx,%esi andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - addl %esi,%edx movl %ebp,%esi + xorl %ebx,%edi roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx + xorl %eax,%esi + xorl %ebx,%eax addl %ebp,%edx - movl %eax,%edi pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 - xorl %ebx,%eax addl 16(%rsp),%ecx - andl %ebx,%edi - pxor %xmm2,%xmm1 andl %eax,%esi + xorl %ebx,%eax rorl $7,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 - addl %edi,%ecx + punpcklqdq %xmm0,%xmm9 movl %edx,%edi - pxor %xmm9,%xmm1 + xorl %eax,%esi + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) - movl %ebp,%esi + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx addl 20(%rsp),%ebx - andl %eax,%esi - pslld $2,%xmm1 andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - psrld $30,%xmm9 - addl %esi,%ebx + movdqa %xmm1,%xmm9 movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - por %xmm9,%xmm1 - movl %edx,%edi + xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx - movdqa %xmm1,%xmm10 + addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax - andl %ebp,%edi andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - addl %edi,%eax movl %ebx,%edi + xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movl %ecx,%esi + xorl %ecx,%edi xorl %edx,%ecx + addl %ebx,%eax addl 28(%rsp),%ebp - andl %edx,%esi andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - addl %esi,%ebp movl %eax,%esi + xorl %ecx,%edi roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx + xorl %ebx,%esi + xorl %ecx,%ebx addl %eax,%ebp - movl %ebx,%edi pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 - xorl %ecx,%ebx addl 32(%rsp),%edx - andl %ecx,%edi - pxor %xmm3,%xmm2 andl %ebx,%esi + xorl %ecx,%ebx rorl $7,%eax - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 - addl %edi,%edx + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi - pxor %xmm10,%xmm2 + xorl %ebx,%esi + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) - movl %eax,%esi + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx addl 36(%rsp),%ecx - andl %ebx,%esi - pslld $2,%xmm2 andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - psrld $30,%xmm10 - addl %esi,%ecx + movdqa %xmm2,%xmm10 movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - por %xmm10,%xmm2 - movl %ebp,%edi + xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp - movdqa %xmm2,%xmm8 + addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx - andl %eax,%edi andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - addl %edi,%ebx movl %ecx,%edi + xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%esi + xorl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx addl 44(%rsp),%eax - andl %ebp,%esi andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - addl %esi,%eax movl %ebx,%esi + xorl %edx,%edi roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx + xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor 
%xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 - xorl %edx,%esi + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm2,%xmm9 + addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm9,32(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 + addl %ebp,%edx addl 56(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm8 + pslld $2,%xmm3 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi movdqa %xmm10,48(%rsp) - addl %ebx,%eax rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax addl 4(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 8(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 12(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2302,113 +2230,112 @@ _ssse3_shortcut: .byte 102,15,56,0,198 addq $64,%r9 addl 16(%rsp),%ebx - xorl %eax,%esi -.byte 102,15,56,0,206 + xorl %ebp,%esi movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx paddd %xmm9,%xmm0 - xorl %ebp,%esi addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax - xorl %ebp,%edi - psubd %xmm9,%xmm0 + xorl %edx,%edi movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi -.byte 102,15,56,0,214 + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp paddd %xmm9,%xmm1 - xorl %eax,%esi addl %edx,%ecx - rorl $7,%ebp - addl %esi,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx - xorl %eax,%edi - psubd %xmm9,%xmm1 + xorl %ebp,%edi movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx 
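# [annotation, not part of the generated diff] The .byte 102,15,56,0,NNN
# sequences in this region hand-encode pshufb (opcode 66 0F 38 00 /r);
# for example 198 = 0xC6 is the modrm byte of pshufb %xmm6,%xmm0. They
# are emitted as raw bytes so that assemblers without SSSE3 support
# still accept the file. %xmm6 holds the 0x00010203... mask from the
# end of K_XX_XX, so each pshufb byte-swaps four big-endian message
# words at once. Note also the paddd %xmm9 / psubd %xmm9 pair around
# the movdqa stores: W+K is written to the stack for the scalar rounds,
# then K is subtracted again so the registers keep the raw W values.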
addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi -.byte 102,15,56,0,222 + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax paddd %xmm9,%xmm2 - xorl %ebx,%esi addl %ebp,%edx - rorl $7,%eax - addl %esi,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx - xorl %ebx,%edi - psubd %xmm9,%xmm2 + xorl %eax,%edi movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2418,108 +2345,110 @@ _ssse3_shortcut: movl %esi,4(%r8) movl %esi,%ebx movl %ecx,8(%r8) + movl %ecx,%edi movl %edx,12(%r8) + xorl %edx,%edi movl %ebp,16(%r8) + andl %edi,%esi jmp .Loop_ssse3 .p2align 4 .Ldone_ssse3: addl 16(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 20(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - addl 36(%rsp),%ebx xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 52(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi 
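# [annotation, not part of the generated diff] Most of this hunk leaves
# the computation unchanged: the xorl/andl F-function steps, roll $5,
# rorl $7 and the addl accumulations are the same operations reordered,
# apparently to shorten dependency chains and interleave better with
# the vector schedule. One visible structural change: after the state
# words are stored back to (%r8), the new code precomputes c ^ d and
# b & (c ^ d) for the next block (the added movl %ecx,%edi /
# xorl %edx,%edi / andl %edi,%esi just before jmp .Loop_ssse3) rather
# than deriving them inside the first round.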
- addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2530,16 +2459,19 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - movaps 64+0(%rsp),%xmm6 - movaps 64+16(%rsp),%xmm7 - movaps 64+32(%rsp),%xmm8 - movaps 64+48(%rsp),%xmm9 - movaps 64+64(%rsp),%xmm10 - leaq 144(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + movaps -40-96(%r14),%xmm6 + movaps -40-80(%r14),%xmm7 + movaps -40-64(%r14),%xmm8 + movaps -40-48(%r14),%xmm9 + movaps -40-32(%r14),%xmm10 + movaps -40-16(%r14),%xmm11 + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_ssse3: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -2547,11 +2479,16 @@ _ssse3_shortcut: .LSEH_end_sha1_block_data_order_ssse3: .p2align 6 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2583,16 +2520,17 @@ se_handler: jae .Lcommon_seh_tail movq 64(%rax),%rax - leaq 32(%rax),%rax movq -8(%rax),%rbx movq -16(%rax),%rbp movq -24(%rax),%r12 movq -32(%rax),%r13 + movq -40(%rax),%r14 movq %rbx,144(%r8) movq %rbp,160(%r8) movq %r12,216(%r8) movq %r13,224(%r8) + movq %r14,232(%r8) jmp .Lcommon_seh_tail @@ -2629,18 +2567,23 @@ ssse3_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 64(%rax),%rsi + movq 232(%r8),%rax + + leaq -40-96(%rax),%rsi leaq 512(%r8),%rdi - movl $10,%ecx -.long 0xa548f3fc - leaq 168(%rax),%rax + movl $12,%ecx +.long 0xa548f3fc movq -8(%rax),%rbx movq -16(%rax),%rbp movq -24(%rax),%r12 + movq -32(%rax),%r13 + movq -40(%rax),%r14 movq %rbx,144(%r8) movq %rbp,160(%r8) movq %r12,216(%r8) + movq %r13,224(%r8) + movq %r14,232(%r8) .Lcommon_seh_tail: movq 8(%rax),%rdi @@ -2652,7 +2595,7 @@ ssse3_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -2698,5 +2641,5 @@ ssse3_handler: .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler -.rva .Lprologue_ssse3,.Lepilogue_ssse3 +.rva .Lprologue_ssse3,.Lepilogue_ssse3 diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s 
b/lib/accelerated/x86/coff/sha256-ssse3-x86.s index 61d6eaccf9..91cf88cc34 100644 --- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s @@ -64,195 +64,391 @@ _sha256_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + jmp .L002loop .align 16 .L002loop: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx - movl 12(%edi),%edx bswap %eax + movl 12(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 16(%edi),%eax movl 20(%edi),%ebx movl 24(%edi),%ecx - movl 28(%edi),%edx bswap %eax + movl 28(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 32(%edi),%eax movl 36(%edi),%ebx movl 40(%edi),%ecx - movl 44(%edi),%edx bswap %eax + movl 44(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 48(%edi),%eax movl 52(%edi),%ebx movl 56(%edi),%ecx - movl 60(%edi),%edx bswap %eax + movl 60(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx addl $64,%edi - subl $32,%esp - movl %edi,100(%esp) + leal -36(%esp),%esp + movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl %edi,12(%esp) + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edi,28(%esp) + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) .align 16 .L00300_15: - movl 92(%esp),%ebx movl %edx,%ecx + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi + movl 28(%esp),%edi xorl %edx,%ecx - rorl $5,%ecx - xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl 96(%esp),%ebx + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx - movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx - movl %eax,(%esp) - movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - andl %esi,%ecx - andl %edi,%eax - movl (%ebp),%esi - orl %ecx,%eax + xorl %edi,%eax addl $4,%ebp addl %ebx,%eax - addl %esi,%edx - addl %esi,%eax cmpl $3248222580,%esi jne .L00300_15 - movl 152(%esp),%ebx + movl 156(%esp),%ecx + jmp .L00416_63 .align 16 .L00416_63: - movl %ebx,%esi - movl 100(%esp),%ecx - rorl $11,%esi - movl %ecx,%edi - xorl %ebx,%esi - rorl $7,%esi + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx shrl $3,%ebx - rorl $2,%edi - xorl %esi,%ebx - xorl %ecx,%edi - rorl $17,%edi - shrl $10,%ecx - addl 156(%esp),%ebx - xorl %ecx,%edi - addl 120(%esp),%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx movl %edx,%ecx - addl %edi,%ebx + xorl %esi,%edi + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi - xorl %edx,%ecx - rorl $5,%ecx - movl %ebx,92(%esp) + addl 
%edi,%ebx + movl 28(%esp),%edi xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl %ebx,96(%esp) + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx - movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne .L00416_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb .L002loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005loop_shrd: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 16 +.L00600_15_shrd: + movl %edx,%ecx + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne .L00600_15_shrd + movl 156(%esp),%ecx + jmp .L00716_63_shrd +.align 16 +.L00716_63_shrd: + movl %ecx,%ebx + movl 104(%esp),%esi + shrdl $11,%ecx,%ecx + movl %esi,%edi + shrdl $2,%esi,%esi + xorl %ebx,%ecx + 
shrl $3,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + shrdl $17,%esi,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - andl %esi,%ecx - andl %edi,%eax + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx movl (%ebp),%esi - orl %ecx,%eax + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx addl $4,%ebp addl %ebx,%eax - movl 152(%esp),%ebx - addl %esi,%edx - addl %esi,%eax cmpl $3329325298,%esi - jne .L00416_63 - movl 352(%esp),%esi - movl 4(%esp),%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edi + jne .L00716_63_shrd + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx - addl 8(%esi),%ecx - addl 12(%esi),%edi + addl 8(%esi),%edi + addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) - movl %ecx,8(%esi) - movl %edi,12(%esi) - movl 20(%esp),%eax - movl 24(%esp),%ebx - movl 28(%esp),%ecx - movl 356(%esp),%edi + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi addl 16(%esi),%edx addl 20(%esi),%eax addl 24(%esi),%ebx @@ -261,10 +457,10 @@ _sha256_block_data_order: movl %eax,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) - addl $352,%esp + leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb .L002loop + jb .L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -273,25 +469,2918 @@ _sha256_block_data_order: ret .align 64 .L001K256: -.long 1116352408,1899447441,3049323471,3921009573 -.long 961987163,1508970993,2453635748,2870763221 -.long 3624381080,310598401,607225278,1426881987 -.long 1925078388,2162078206,2614888103,3248222580 -.long 3835390401,4022224774,264347078,604807628 -.long 770255983,1249150122,1555081692,1996064986 -.long 2554220882,2821834349,2952996808,3210313671 -.long 3336571891,3584528711,113926993,338241895 -.long 666307205,773529912,1294757372,1396182291 -.long 1695183700,1986661051,2177026350,2456956037 -.long 2730485921,2820302411,3259730800,3345764771 -.long 3516065817,3600352804,4094571909,275423344 -.long 430227734,506948616,659060556,883997877 -.long 958139571,1322822218,1537002063,1747873779 -.long 1955562222,2024104815,2227730452,2361852424 -.long 2428436474,2756734187,3204031479,3329325298 +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 .byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 +.align 16 +.L008unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp .L009grand_loop +.align 16 +.L009grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + 
rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 
310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 
88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + 
rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 
36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 
8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl 
%edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl 
%edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl 
%edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 
92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + 
rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 
40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 
4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb .L009grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s index acad0ec1e7..42233253c9 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s @@ -594,6 +594,8 @@ _sha512_block_data_order: .long 4234509866,1501505948 .long 987167468,1607167915 .long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 diff --git 
a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index 2b6cd0fcd3..edaa67b95f 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -39,6 +39,7 @@ # .text + .globl sha256_block_data_order .def sha256_block_data_order; .scl 2; .type 32; .endef .p2align 4 @@ -50,8 +51,13 @@ sha256_block_data_order: movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - movq %r9,%rcx + leaq _gnutls_x86_cpuid_s(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $512,%r10d + jnz .Lssse3_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -69,8 +75,6 @@ sha256_block_data_order: movq %r11,64+24(%rsp) .Lprologue: - leaq K256(%rip),%rbp - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -83,1694 +87,1632 @@ sha256_block_data_order: .p2align 4 .Lloop: - xorq %rdi,%rdi + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d 
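
The hunk above rewrites the scalar rounds of sha256_block_data_order and adds a runtime dispatch: the cached CPUID words in _gnutls_x86_cpuid_s are loaded and "testl $512,%r10d" appears to test bit 9 of CPUID.1:ECX (the SSSE3 feature flag) before jumping to the new .Lssse3_shortcut path. For orientation, a minimal C sketch of the round computation that the interleaved rorl/xorl/andl sequences implement, following the FIPS 180-4 definitions; all names are illustrative, not taken from the source. The rorl $14/$5/$6 chains are a rotate-factored form of Sigma1 (ROTR(ROTR(ROTR(e,14)^e,5)^e,6) = ROTR(e,6)^ROTR(e,11)^ROTR(e,25)), and the rorl $9/$11/$2 chains are the analogous factoring of Sigma0. The rewrite also appears to carry b^c in %edi between rounds so Maj costs one and plus one xor per round.

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned r)
{
    return (x >> r) | (x << (32 - r));
}

/* One SHA-256 round (FIPS 180-4).  The assembly interleaves these
   operations across %r12d-%r15d to hide instruction latency. */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch  = (e & f) ^ (~e & g);            /* Ch(e,f,g)  */
    uint32_t t1  = h + S1 + ch + k + w;
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);   /* Maj(a,b,c) */
    uint32_t t2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}
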
rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + 
xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp + addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl 
(%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - 
xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp jmp .Lrounds_16_xx .p2align 4 .Lrounds_16_xx: movl 4(%rsp),%r13d - movl 56(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 56(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 36(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 8(%rsp),%r13d - movl 60(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 60(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 40(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 12(%rsp),%r13d - movl 0(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 0(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 44(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl 
%r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 16(%rsp),%r13d - movl 4(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 4(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 48(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 20(%rsp),%r13d - movl 8(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 8(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 52(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi 
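
The .Lrounds_16_xx hunk interleaves the SHA-256 message-schedule expansion with the rounds, keeping only a sixteen-word window at 0..60(%rsp) that is updated in place (each new word is folded straight into a round, e.g. "addl 36(%rsp),%r12d" followed by "addl 0(%rsp),%r12d"). The rorl $11/$7 chain with shrl $3, and the rorl $2/$17 chain with shrl $10, are rotate-factored forms of the small sigma functions. A self-contained C sketch of the expansion being computed, with illustrative names and the FIPS 180-4 definitions:

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned r)
{
    return (x >> r) | (x << (32 - r));
}

/* sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)   -- the shrl $3 path  */
static uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }

/* sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10) -- the shrl $10 path */
static uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

/* Full-array form of the expansion; the assembly keeps only the last
   sixteen words as a ring buffer on the stack. */
static void sha256_expand(uint32_t w[64])
{
    for (int t = 16; t < 64; t++)
        w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}
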
rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 24(%rsp),%r13d - movl 12(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 12(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 56(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 28(%rsp),%r13d - movl 16(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 16(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 60(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 32(%rsp),%r13d - movl 20(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 20(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 0(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl 
%r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp movl 36(%rsp),%r13d - movl 24(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 24(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 4(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 40(%rsp),%r13d - movl 28(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 28(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 8(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 44(%rsp),%r13d - movl 32(%rsp),%r14d - 
movl %r13d,%r12d - movl %r14d,%r15d + movl 32(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 12(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 48(%rsp),%r13d - movl 36(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 36(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 16(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 52(%rsp),%r13d - movl 40(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 40(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 20(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + 
addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 56(%rsp),%r13d - movl 44(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 44(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 24(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 60(%rsp),%r13d - movl 48(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 48(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 28(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 0(%rsp),%r13d - movl 52(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 52(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 32(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl 
$3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax - cmpq $64,%rdi - jb .Lrounds_16_xx + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi + addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax @@ -1811,21 +1753,1158 @@ sha256_block_data_order: K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.def sha256_block_data_order_ssse3; .scl 3; .type 32; .endef +.p2align 6 +sha256_block_data_order_ssse3: 
+ movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_sha256_block_data_order_ssse3: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + +.Lssse3_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) + movaps %xmm6,64+32(%rsp) + movaps %xmm7,64+48(%rsp) + movaps %xmm8,64+64(%rsp) + movaps %xmm9,64+80(%rsp) +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 +.byte 102,15,56,0,199 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 +.byte 102,15,56,0,215 + movdqa 32(%rbp),%xmm5 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl 
%ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + 
movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd 
%xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d 
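
The new .Lssse3_00_47 loop advances four message-schedule words per iteration: palignr (the .byte 102,15,58,15,... sequences) extracts W[t-15..t-12] and W[t-7..t-4] from adjacent xmm registers, psrld/pslld/pxor evaluate sigma0 across all four lanes, and the psrlq/pshufd pair handles sigma1 of the two newest words. Below is a hedged intrinsics sketch of the four-lane sigma0 step only; the function name and layout are mine, and the real code factors the shifts differently (reusing intermediates, e.g. pslld $14 followed by pslld $11 to build the left-by-25 term) while computing the same value:

#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Four-lane sigma0 over W[t-15..t-12], assuming w0 = W[t-16..t-13]
   and w1 = W[t-12..t-9], one 32-bit schedule word per lane. */
static __m128i sha256_sigma0_x4(__m128i w0, __m128i w1)
{
    __m128i x   = _mm_alignr_epi8(w1, w0, 4);        /* W[t-15..t-12]    */
    __m128i r7  = _mm_or_si128(_mm_srli_epi32(x, 7),
                               _mm_slli_epi32(x, 25)); /* ROTR(x,7)      */
    __m128i r18 = _mm_or_si128(_mm_srli_epi32(x, 18),
                               _mm_slli_epi32(x, 14)); /* ROTR(x,18)     */
    return _mm_xor_si128(_mm_xor_si128(r7, r18),
                         _mm_srli_epi32(x, 3));        /* ... ^ (x >> 3) */
}
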
+ rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl 
%eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 64+24(%rsp),%rsi + movaps 64+32(%rsp),%xmm6 + movaps 64+48(%rsp),%xmm7 + movaps 64+64(%rsp),%xmm8 + movaps 64+80(%rsp),%xmm9 + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_ssse3: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_sha256_block_data_order_ssse3: .def se_handler; .scl 3; .type 32; .endef .p2align 4 @@ -1844,16 +2923,21 @@ se_handler: movq 120(%r8),%rax movq 248(%r8),%rbx - leaq .Lprologue(%rip),%r10 + movq 8(%r9),%rsi + movq 56(%r9),%r11 + + movl 0(%r11),%r10d + leaq (%rsi,%r10,1),%r10 cmpq %r10,%rbx jb .Lin_prologue movq 152(%r8),%rax - leaq .Lepilogue(%rip),%r10 + movl 4(%r11),%r10d + leaq (%rsi,%r10,1),%r10 cmpq %r10,%rbx jae .Lin_prologue - + movq %rax,%rsi movq 64+24(%rax),%rax leaq 48(%rax),%rax @@ -1870,6 +2954,15 @@ se_handler: movq %r14,232(%r8) movq %r15,240(%r8) + leaq .Lepilogue(%rip),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + leaq 64+32(%rsi),%rsi + leaq 512(%r8),%rdi + movl $8,%ecx +.long 0xa548f3fc + .Lin_prologue: movq 8(%rax),%rdi movq 16(%rax),%rsi @@ -1880,7 +2973,7 @@ se_handler: movq 40(%r9),%rdi movq %r8,%rsi movl $154,%ecx -.long 0xa548f3fc +.long 0xa548f3fc movq %r9,%rsi xorq %rcx,%rcx @@ -1915,10 +3008,17 @@ se_handler: .rva 
.LSEH_begin_sha256_block_data_order .rva .LSEH_end_sha256_block_data_order .rva .LSEH_info_sha256_block_data_order - +.rva .LSEH_begin_sha256_block_data_order_ssse3 +.rva .LSEH_end_sha256_block_data_order_ssse3 +.rva .LSEH_info_sha256_block_data_order_ssse3 .section .xdata .p2align 3 .LSEH_info_sha256_block_data_order: .byte 9,0,0,0 .rva se_handler +.rva .Lprologue,.Lepilogue +.LSEH_info_sha256_block_data_order_ssse3: +.byte 9,0,0,0 +.rva se_handler +.rva .Lprologue_ssse3,.Lepilogue_ssse3 diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86.s b/lib/accelerated/x86/elf/aes-ssse3-x86.s index 92bdeeadb3..3aa221267a 100644 --- a/lib/accelerated/x86/elf/aes-ssse3-x86.s +++ b/lib/accelerated/x86/elf/aes-ssse3-x86.s @@ -85,33 +85,33 @@ _vpaes_encrypt_core: movdqa %xmm6,%xmm1 movdqa (%ebp),%xmm2 pandn %xmm0,%xmm1 - movdqu (%edx),%xmm5 - psrld $4,%xmm1 pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 .byte 102,15,56,0,208 movdqa 16(%ebp),%xmm0 -.byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 + psrld $4,%xmm1 addl $16,%edx +.byte 102,15,56,0,193 leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 jmp .L000enc_entry .align 16 .L001enc_loop: movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 movdqa 80(%ebp),%xmm2 -.byte 102,15,56,0,211 - pxor %xmm5,%xmm2 movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addl $16,%edx pxor %xmm2,%xmm0 @@ -120,28 +120,28 @@ _vpaes_encrypt_core: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andl $48,%ecx - pxor %xmm3,%xmm0 subl $1,%eax + pxor %xmm3,%xmm0 .L000enc_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 - movdqu (%edx),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz .L001enc_loop movdqa 96(%ebp),%xmm4 @@ -157,8 +157,8 @@ _vpaes_encrypt_core: .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: - movl 240(%edx),%eax leal 608(%ebp),%ebx + movl 240(%edx),%eax movdqa %xmm6,%xmm1 movdqa -64(%ebx),%xmm2 pandn %xmm0,%xmm1 @@ -181,56 +181,56 @@ _vpaes_decrypt_core: .align 16 .L003dec_loop: movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addl $16,%edx -.byte 102,15,56,0,197 movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subl $1,%eax .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor 
%xmm4,%xmm0 + addl $16,%edx .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax .L002dec_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 pandn %xmm0,%xmm1 - psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm2 + psrld $4,%xmm1 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm7,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 jnz .L003dec_loop movdqa 96(%ebx),%xmm4 .byte 102,15,56,0,226 @@ -339,12 +339,12 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear @@ -613,6 +613,8 @@ vpaes_cbc_encrypt: movl 24(%esp),%edi movl 28(%esp),%eax movl 32(%esp),%edx + subl $16,%eax + jc .L020cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -622,18 +624,17 @@ vpaes_cbc_encrypt: subl %esi,%edi movl %ebx,48(%esp) movl %edi,(%esp) - subl $16,%eax movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal .L_vpaes_consts+0x30-.L020pic_point,%ebp + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp call _vpaes_preheat -.L020pic_point: +.L021pic_point: cmpl $0,%ecx - je .L021cbc_dec_loop - jmp .L022cbc_enc_loop + je .L022cbc_dec_loop + jmp .L023cbc_enc_loop .align 16 -.L022cbc_enc_loop: +.L023cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call _vpaes_encrypt_core @@ -643,10 +644,10 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L022cbc_enc_loop - jmp .L023cbc_done + jnc .L023cbc_enc_loop + jmp .L024cbc_done .align 16 -.L021cbc_dec_loop: +.L022cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -658,11 +659,12 @@ vpaes_cbc_encrypt: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc .L021cbc_dec_loop -.L023cbc_done: + jnc .L022cbc_dec_loop +.L024cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) +.L020cbc_abort: popl %edi popl %esi popl %ebx diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s index 78dd07e700..09cc51059f 100644 --- a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s @@ -43,8 +43,8 @@ _vpaes_encrypt_core: movdqa .Lk_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 addq $16,%r9 + pxor %xmm2,%xmm0 leaq .Lk_mc_backward(%rip),%r10 jmp .Lenc_entry @@ -52,19 +52,19 @@ _vpaes_encrypt_core: .Lenc_loop: movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 - pxor %xmm5,%xmm2 - movdqa (%r11,%r10,1),%xmm4 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 @@ -73,30 +73,30 @@ _vpaes_encrypt_core: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $48,%r11 - pxor %xmm3,%xmm0 subq $1,%rax + pxor %xmm3,%xmm0 .Lenc_entry: movdqa %xmm9,%xmm1 + movdqa 
%xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 - movdqu (%r9),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz .Lenc_loop @@ -149,62 +149,61 @@ _vpaes_decrypt_core: movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addq $16,%r9 - -.byte 102,15,56,0,197 movdqa 0(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%r10),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subq $1,%rax + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 -.byte 102,15,56,0,197 - movdqa 32(%r10),%xmm4 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 +.byte 102,15,56,0,226 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - + addq $16,%r9 .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax .Ldec_entry: movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 jnz .Ldec_loop @@ -212,7 +211,7 @@ _vpaes_decrypt_core: .byte 102,15,56,0,226 pxor %xmm0,%xmm4 movdqa 112(%r10),%xmm0 - movdqa .Lk_sr-.Lk_dsbd(%r11),%xmm2 + movdqa -352(%r11),%xmm2 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 .byte 102,15,56,0,194 @@ -232,7 +231,7 @@ _vpaes_schedule_core: - call _vpaes_preheat + call _vpaes_preheat movdqa .Lk_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -278,7 +277,7 @@ _vpaes_schedule_core: call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle jmp .Loop_schedule_128 @@ -299,7 +298,7 @@ _vpaes_schedule_core: .align 16 .Lschedule_192: movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqa %xmm0,%xmm6 pxor %xmm4,%xmm4 movhlps %xmm4,%xmm6 @@ -308,13 +307,13 @@ _vpaes_schedule_core: .Loop_schedule_192: call _vpaes_schedule_round .byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear jmp .Loop_schedule_192 @@ -331,18 +330,18 @@ _vpaes_schedule_core: .align 16 .Lschedule_256: movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movl $7,%esi .Loop_schedule_256: - call _vpaes_schedule_mangle + call 
_vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle pshufd $255,%xmm0,%xmm0 @@ -380,7 +379,7 @@ _vpaes_schedule_core: .Lschedule_mangle_last_dec: addq $-16,%rdx pxor .Lk_s63(%rip),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqu %xmm0,(%rdx) @@ -412,12 +411,12 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear @@ -680,9 +679,10 @@ vpaes_decrypt: .align 16 vpaes_cbc_encrypt: xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort movdqu (%r8),%xmm6 subq %rdi,%rsi - subq $16,%rcx call _vpaes_preheat cmpl $0,%r9d je .Lcbc_dec_loop @@ -711,6 +711,7 @@ vpaes_cbc_encrypt: jnc .Lcbc_dec_loop .Lcbc_done: movdqu %xmm6,(%r8) +.Lcbc_abort: .byte 0xf3,0xc3 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt @@ -833,7 +834,7 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts diff --git a/lib/accelerated/x86/elf/aesni-x86.s b/lib/accelerated/x86/elf/aesni-x86.s index 5ee55499d6..ab8ee06865 100644 --- a/lib/accelerated/x86/elf/aesni-x86.s +++ b/lib/accelerated/x86/elf/aesni-x86.s @@ -87,29 +87,84 @@ aesni_decrypt: movups %xmm2,(%eax) ret .size aesni_decrypt,.-.L_aesni_decrypt_begin +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret +.size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 
32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz .L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -122,25 +177,26 @@ _aesni_encrypt3: .align 16 _aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -.L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz .L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -154,27 +210,29 @@ _aesni_decrypt3: _aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz .L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -190,27 +248,29 @@ _aesni_encrypt4: _aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -.L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%edx),%xmm0 - jnz .L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -225,45 +285,44 @@ _aesni_decrypt4: .align 16 _aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 + addl $16,%ecx 
+.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 + movups -16(%edx,%ecx,1),%xmm0 jmp .L_aesni_encrypt6_enter .align 16 -.L006enc6_loop: +.L008enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 16 .L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz .L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L008enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -282,45 +341,44 @@ _aesni_encrypt6: .align 16 _aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 + addl $16,%ecx +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 + movups -16(%edx,%ecx,1),%xmm0 jmp .L_aesni_decrypt6_enter .align 16 -.L007dec6_loop: +.L009dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 16 .L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz .L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L009dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -350,14 +408,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L008ecb_ret + jz .L010ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L009ecb_decrypt + jz .L011ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L010ecb_enc_tail + jb .L012ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -366,9 +424,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L011ecb_enc_loop6_enter + jmp .L013ecb_enc_loop6_enter .align 16 -.L012ecb_enc_loop6: +.L014ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -383,12 +441,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L011ecb_enc_loop6_enter: +.L013ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L012ecb_enc_loop6 + jnc .L014ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -397,18 +455,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L010ecb_enc_tail: + jz .L010ecb_ret +.L012ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L013ecb_enc_one + jb .L015ecb_enc_one movups 16(%esi),%xmm3 - je .L014ecb_enc_two + je .L016ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L015ecb_enc_three + jb .L017ecb_enc_three movups 48(%esi),%xmm5 - je .L016ecb_enc_four + je .L018ecb_enc_four movups 64(%esi),%xmm6 xorps 
%xmm7,%xmm7 call _aesni_encrypt6 @@ -417,50 +475,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L013ecb_enc_one: +.L015ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L017enc1_loop_3: +.L019enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L017enc1_loop_3 + jnz .L019enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L014ecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 +.L016ecb_enc_two: + call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L015ecb_enc_three: +.L017ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L016ecb_enc_four: +.L018ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L009ecb_decrypt: +.L011ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L018ecb_dec_tail + jb .L020ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -469,9 +526,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L019ecb_dec_loop6_enter + jmp .L021ecb_dec_loop6_enter .align 16 -.L020ecb_dec_loop6: +.L022ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -486,12 +543,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L019ecb_dec_loop6_enter: +.L021ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L020ecb_dec_loop6 + jnc .L022ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -500,18 +557,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L008ecb_ret -.L018ecb_dec_tail: + jz .L010ecb_ret +.L020ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L021ecb_dec_one + jb .L023ecb_dec_one movups 16(%esi),%xmm3 - je .L022ecb_dec_two + je .L024ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L023ecb_dec_three + jb .L025ecb_dec_three movups 48(%esi),%xmm5 - je .L024ecb_dec_four + je .L026ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -520,44 +577,43 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L021ecb_dec_one: +.L023ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L025dec1_loop_4: +.L027dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L025dec1_loop_4 + jnz .L027dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L022ecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 +.L024ecb_dec_two: + call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L023ecb_dec_three: +.L025ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L008ecb_ret + jmp .L010ecb_ret .align 16 -.L024ecb_dec_four: +.L026ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L008ecb_ret: +.L010ecb_ret: popl %edi popl %esi popl %ebx @@ -596,45 
+652,45 @@ aesni_ccm64_encrypt_blocks: movl %ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -.L026ccm64_enc_outer: +.L028ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -.L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +.L029ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L029ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz .L026ccm64_enc_outer + leal 16(%edi),%edi + jnz .L028ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -685,67 +741,70 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L028enc1_loop_5: +.L030enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L028enc1_loop_5 + jnz .L030enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp .L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp .L031ccm64_dec_outer .align 16 -.L029ccm64_dec_outer: +.L031ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L030ccm64_dec_break + jz .L032ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -.L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +.L033ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz .L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz .L033ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp .L029ccm64_dec_outer + leal 16(%esi),%esi + jmp .L031ccm64_dec_outer .align 16 -.L030ccm64_dec_break: +.L032ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L032enc1_loop_6: +.L034enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L032enc1_loop_6 + jnz .L034enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -775,7 +834,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L033ctr32_one_shortcut + je .L035ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -791,63 +850,59 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 
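# NOTE: in this ctr32 path, the dwords 0x0C0D0E0F/0x08090A0B stored at (%esp)
# above build a pshufb byte-swap mask: the low 32 bits of the big-endian
# counter block are kept in general registers in host order, stepped with
# incl, re-inserted with pinsrd (.byte 102,15,58,34,...) and swapped back
# with pshufb (.byte 102,15,56,0,...), which is cheaper than a full 128-bit
# big-endian increment. The +/- lines that follow swap the %xmm0/%xmm1 roles
# and fold the first round key into the saved counter (pxor %xmm6,%xmm7)
# ahead of the six-block loop.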
+ pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L034ctr32_tail + jb .L036ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L035ctr32_loop6 -.align 16 -.L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx + jmp .L037ctr32_loop6 +.align 16 +.L037ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -858,51 +913,51 @@ aesni_ctr32_encrypt_blocks: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L035ctr32_loop6 + jnc .L037ctr32_loop6 addl $6,%eax - jz .L036ctr32_ret + jz .L038ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -.L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L036ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb .L039ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L038ctr32_two - pshufd $192,%xmm0,%xmm5 + je .L040ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb .L041ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L040ctr32_four + je .L042ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -920,39 +975,39 @@ 
aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L033ctr32_one_shortcut: +.L035ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L037ctr32_one: +.L039ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L041enc1_loop_7: +.L043enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L041enc1_loop_7 + jnz .L043enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L038ctr32_two: - call _aesni_encrypt3 +.L040ctr32_two: + call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L039ctr32_three: +.L041ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -963,9 +1018,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L036ctr32_ret + jmp .L038ctr32_ret .align 16 -.L040ctr32_four: +.L042ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -979,7 +1034,7 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L036ctr32_ret: +.L038ctr32_ret: movl 80(%esp),%esp popl %edi popl %esi @@ -1004,12 +1059,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L042enc1_loop_8: +.L044enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L042enc1_loop_8 + jnz .L044enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1033,12 +1088,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L044xts_enc_loop6 + jc .L045xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L046xts_enc_loop6 .align 16 -.L044xts_enc_loop6: +.L046xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1074,6 +1131,7 @@ aesni_xts_encrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1089,19 +1147,17 @@ aesni_xts_encrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call .L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1126,26 +1182,25 @@ aesni_xts_encrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc .L046xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L043xts_enc_short: +.L045xts_enc_short: addl $96,%eax - jz .L045xts_enc_done6x + jz .L047xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L046xts_enc_one + jb .L048xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L047xts_enc_two + je .L049xts_enc_two pshufd $19,%xmm0,%xmm2 
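# NOTE: each pshufd $19 / pcmpgtd / paddq / pand / pxor group in these xts
# paths doubles the 128-bit tweak in GF(2^128), roughly
#   T = (T << 1) ^ (0x87 if the top bit of T was set)
# pcmpgtd against zero grabs the per-dword sign bits, pshufd $19 routes the
# bit-127 carry to the low lane and the bit-63 carry to the high lane, paddq
# shifts both 64-bit halves left by one, and pand with the {0x87,0,1,0}
# magic mask in %xmm3 selects the reduction and carry words that pxor folds
# back into the tweak.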
pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1154,7 +1209,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L048xts_enc_three + jb .L050xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1164,7 +1219,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L049xts_enc_four + je .L051xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1196,9 +1251,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L046xts_enc_one: +.L048xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1206,37 +1261,36 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L051enc1_loop_9: +.L053enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L051enc1_loop_9 + jnz .L053enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L047xts_enc_two: +.L049xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L048xts_enc_three: +.L050xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1254,9 +1308,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L049xts_enc_four: +.L051xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1278,28 +1332,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L050xts_enc_done + jmp .L052xts_enc_done .align 16 -.L045xts_enc_done6x: +.L047xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L052xts_enc_ret + jz .L054xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L053xts_enc_steal + jmp .L055xts_enc_steal .align 16 -.L050xts_enc_done: +.L052xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L052xts_enc_ret + jz .L054xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L053xts_enc_steal: +.L055xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1307,7 +1361,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L053xts_enc_steal + jnz .L055xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1317,16 +1371,16 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L054enc1_loop_10: +.L056enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L054enc1_loop_10 + jnz .L056enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L052xts_enc_ret: +.L054xts_enc_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1351,12 +1405,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L055enc1_loop_11: +.L057enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L055enc1_loop_11 + jnz .L057enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1385,12 +1439,14 
@@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp .L057xts_dec_loop6 + jc .L058xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L059xts_dec_loop6 .align 16 -.L057xts_dec_loop6: +.L059xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1426,6 +1482,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1441,19 +1498,17 @@ aesni_xts_decrypt: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call .L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1478,26 +1533,25 @@ aesni_xts_decrypt: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc .L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc .L059xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L056xts_dec_short: +.L058xts_dec_short: addl $96,%eax - jz .L058xts_dec_done6x + jz .L060xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L059xts_dec_one + jb .L061xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L060xts_dec_two + je .L062xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1506,7 +1560,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L061xts_dec_three + jb .L063xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1516,7 +1570,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L062xts_dec_four + je .L064xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1548,9 +1602,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L059xts_dec_one: +.L061xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1558,36 +1612,36 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L064dec1_loop_12: +.L066dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L064dec1_loop_12 + jnz .L066dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L060xts_dec_two: +.L062xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L061xts_dec_three: +.L063xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1605,9 +1659,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 
-.L062xts_dec_four: +.L064xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1629,20 +1683,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L063xts_dec_done + jmp .L065xts_dec_done .align 16 -.L058xts_dec_done6x: +.L060xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L065xts_dec_ret + jz .L067xts_dec_ret movl %eax,112(%esp) - jmp .L066xts_dec_only_one_more + jmp .L068xts_dec_only_one_more .align 16 -.L063xts_dec_done: +.L065xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L065xts_dec_ret + jz .L067xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1652,7 +1706,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L066xts_dec_only_one_more: +.L068xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1666,16 +1720,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L067dec1_loop_13: +.L069dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L067dec1_loop_13 + jnz .L069dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L068xts_dec_steal: +.L070xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1683,7 +1737,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L068xts_dec_steal + jnz .L070xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1693,16 +1747,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_14: +.L071dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_14 + jnz .L071dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L065xts_dec_ret: +.L067xts_dec_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1728,7 +1782,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L070cbc_abort + jz .L072cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1736,14 +1790,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L071cbc_decrypt + je .L073cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L072cbc_enc_tail + jb .L074cbc_enc_tail subl $16,%eax - jmp .L073cbc_enc_loop + jmp .L075cbc_enc_loop .align 16 -.L073cbc_enc_loop: +.L075cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1751,24 +1805,24 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L074enc1_loop_15: +.L076enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L074enc1_loop_15 + jnz .L076enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L073cbc_enc_loop + jnc .L075cbc_enc_loop addl $16,%eax - jnz .L072cbc_enc_tail + jnz .L074cbc_enc_tail movaps %xmm2,%xmm7 - jmp .L075cbc_ret -.L072cbc_enc_tail: + jmp .L077cbc_ret +.L074cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1779,20 +1833,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L073cbc_enc_loop + jmp .L075cbc_enc_loop .align 16 -.L071cbc_decrypt: +.L073cbc_decrypt: cmpl $80,%eax - jbe .L076cbc_dec_tail + jbe .L078cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L077cbc_dec_loop6_enter + jmp .L079cbc_dec_loop6_enter .align 16 -.L078cbc_dec_loop6: +.L080cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi 
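# NOTE: CBC encryption above runs one block at a time (.L076enc1_loop_15),
# since each block needs the previous ciphertext before it can enter the
# cipher. CBC decryption has no such serial dependency - P[i] = D_K(C[i]) ^
# C[i-1] with every C[] already in hand - so this loop feeds six blocks at
# once to _aesni_decrypt6 and applies the xors afterwards, keeping the
# aesdec pipelines full.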
-.L077cbc_dec_loop6_enter: +.L079cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1822,28 +1876,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L078cbc_dec_loop6 + ja .L080cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L079cbc_dec_tail_collected + jle .L081cbc_dec_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L076cbc_dec_tail: +.L078cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L080cbc_dec_one + jbe .L082cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L081cbc_dec_two + jbe .L083cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L082cbc_dec_three + jbe .L084cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L083cbc_dec_four + jbe .L085cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1866,28 +1920,27 @@ aesni_cbc_encrypt: leal 64(%edi),%edi movaps %xmm6,%xmm2 subl $80,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L080cbc_dec_one: +.L082cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L084dec1_loop_16: +.L086dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L084dec1_loop_16 + jnz .L086dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L081cbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 +.L083cbc_dec_two: + call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) @@ -1895,9 +1948,9 @@ aesni_cbc_encrypt: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L082cbc_dec_three: +.L084cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1908,9 +1961,9 @@ aesni_cbc_encrypt: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L079cbc_dec_tail_collected + jmp .L081cbc_dec_tail_collected .align 16 -.L083cbc_dec_four: +.L085cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1925,23 +1978,23 @@ aesni_cbc_encrypt: leal 48(%edi),%edi movaps %xmm5,%xmm2 subl $64,%eax -.L079cbc_dec_tail_collected: +.L081cbc_dec_tail_collected: andl $15,%eax - jnz .L085cbc_dec_tail_partial + jnz .L087cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L075cbc_ret + jmp .L077cbc_ret .align 16 -.L085cbc_dec_tail_partial: +.L087cbc_dec_tail_partial: movaps %xmm2,(%esp) movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L075cbc_ret: +.L077cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp movups %xmm7,(%ebp) -.L070cbc_abort: +.L072cbc_abort: popl %edi popl %esi popl %ebx @@ -1952,51 +2005,51 @@ aesni_cbc_encrypt: .align 16 _aesni_set_encrypt_key: testl %eax,%eax - jz .L086bad_pointer + jz .L088bad_pointer testl %edx,%edx - jz .L086bad_pointer + jz .L088bad_pointer movups (%eax),%xmm0 xorps %xmm4,%xmm4 leal 16(%edx),%edx cmpl $256,%ecx - je .L08714rounds + je .L08914rounds cmpl $192,%ecx - je .L08812rounds + je .L09012rounds cmpl $128,%ecx - jne .L089bad_keybits + jne .L091bad_keybits .align 16 -.L09010rounds: +.L09210rounds: movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L091key_128_cold + call .L093key_128_cold .byte 102,15,58,223,200,2 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,4 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,8 - call .L092key_128 + 
call .L094key_128 .byte 102,15,58,223,200,16 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,32 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,64 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,128 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,27 - call .L092key_128 + call .L094key_128 .byte 102,15,58,223,200,54 - call .L092key_128 + call .L094key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) xorl %eax,%eax ret .align 16 -.L092key_128: +.L094key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L091key_128_cold: +.L093key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2005,38 +2058,38 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L08812rounds: +.L09012rounds: movq 16(%eax),%xmm2 movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L093key_192a_cold + call .L095key_192a_cold .byte 102,15,58,223,202,2 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,4 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,8 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,16 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,32 - call .L094key_192b + call .L096key_192b .byte 102,15,58,223,202,64 - call .L095key_192a + call .L097key_192a .byte 102,15,58,223,202,128 - call .L094key_192b + call .L096key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) xorl %eax,%eax ret .align 16 -.L095key_192a: +.L097key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L093key_192a_cold: +.L095key_192a_cold: movaps %xmm2,%xmm5 -.L096key_192b_warm: +.L098key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2050,56 +2103,56 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L094key_192b: +.L096key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L096key_192b_warm + jmp .L098key_192b_warm .align 16 -.L08714rounds: +.L08914rounds: movups 16(%eax),%xmm2 movl $13,%ecx leal 16(%edx),%edx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L097key_256a_cold + call .L099key_256a_cold .byte 102,15,58,223,200,1 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,2 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,2 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,4 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,4 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,8 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,8 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,16 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,16 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,32 - call .L099key_256a + call .L101key_256a .byte 102,15,58,223,200,32 - call .L098key_256b + call .L100key_256b .byte 102,15,58,223,202,64 - call .L099key_256a + call .L101key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax ret .align 16 -.L099key_256a: +.L101key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L097key_256a_cold: +.L099key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2108,7 +2161,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L098key_256b: +.L100key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2119,11 +2172,11 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm2 ret .align 4 -.L086bad_pointer: 
+.L088bad_pointer: movl $-1,%eax ret .align 4 -.L089bad_keybits: +.L091bad_keybits: movl $-2,%eax ret .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key @@ -2150,7 +2203,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L100dec_key_ret + jnz .L102dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2158,7 +2211,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L101dec_key_inverse: +.L103dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2168,12 +2221,12 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L101dec_key_inverse + ja .L103dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) xorl %eax,%eax -.L100dec_key_ret: +.L102dec_key_ret: ret .size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 diff --git a/lib/accelerated/x86/elf/aesni-x86_64.s b/lib/accelerated/x86/elf/aesni-x86_64.s index 59b37d83ba..996c531af2 100644 --- a/lib/accelerated/x86/elf/aesni-x86_64.s +++ b/lib/accelerated/x86/elf/aesni-x86_64.s @@ -38,6 +38,7 @@ # *** This file is auto-generated *** # .text + .globl aesni_encrypt .type aesni_encrypt,@function .align 16 @@ -53,7 +54,7 @@ aesni_encrypt: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz .Loop_enc1_1 + jnz .Loop_enc1_1 .byte 102,15,56,221,209 movups %xmm2,(%rsi) .byte 0xf3,0xc3 @@ -74,34 +75,93 @@ aesni_decrypt: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz .Loop_dec1_2 + jnz .Loop_dec1_2 .byte 102,15,56,223,209 movups %xmm2,(%rsi) .byte 0xf3,0xc3 .size aesni_decrypt, .-aesni_decrypt +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + .byte 0xf3,0xc3 +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop2: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop2 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + .byte 0xf3,0xc3 +.size _aesni_decrypt2,.-_aesni_decrypt2 .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Lenc_loop3: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 - movups (%rcx),%xmm0 + movups 
-16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop3 .byte 102,15,56,220,209 @@ -116,25 +176,26 @@ _aesni_encrypt3: .align 16 _aesni_decrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax .Ldec_loop3: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop3 .byte 102,15,56,222,209 @@ -149,28 +210,30 @@ _aesni_decrypt3: .align 16 _aesni_encrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Lenc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop4 .byte 102,15,56,220,209 @@ -187,28 +250,30 @@ _aesni_encrypt4: .align 16 _aesni_decrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax .Ldec_loop4: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop4 .byte 102,15,56,222,209 @@ -225,43 +290,43 @@ _aesni_decrypt4: .align 16 _aesni_encrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%rcx),%xmm0 .byte 102,15,56,220,249 + movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop6_enter .align 16 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .Lenc_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop6 .byte 102,15,56,220,209 @@ -282,43 +347,43 @@ _aesni_encrypt6: .align 16 _aesni_decrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 
32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%rcx),%xmm0 .byte 102,15,56,222,249 + movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop6_enter .align 16 .Ldec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .Ldec_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop6 .byte 102,15,56,222,209 @@ -339,52 +404,51 @@ _aesni_decrypt6: .align 16 _aesni_encrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,220,241 +.byte 102,15,56,220,217 pxor %xmm0,%xmm8 -.byte 102,15,56,220,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop8_enter .align 16 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 .Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lenc_loop8 .byte 102,15,56,220,209 @@ -409,52 +473,51 @@ _aesni_encrypt8: .align 16 _aesni_decrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,222,241 +.byte 102,15,56,222,217 pxor %xmm0,%xmm8 -.byte 102,15,56,222,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop8_enter .align 16 .Ldec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 
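
[Editorial note] Throughout these _aesni_encrypt*/_aesni_decrypt* rewrites the old per-iteration round counter (shrl $1,%eax ... decl %eax) is replaced by a byte-scaled key-schedule index: shll $4,%eax converts the round count to bytes, the key pointer is moved to the end of the schedule (leaq 32(%rcx,%rax,1),%rcx / negq %rax), and the loop walks upward from a negative offset, so the add that bumps the index also produces the flags consumed by jnz. A minimal C model of the pattern behind the new two-block helper, using AES-NI intrinsics; the names and the one-round-per-iteration loop shape are illustrative (the assembly unrolls two rounds per iteration), not the generated code:

#include <wmmintrin.h>   /* AES-NI intrinsics; compile with -maes */
#include <stddef.h>

/* Two blocks interleaved so the multi-cycle aesenc latencies overlap. */
static void aesni_encrypt2_model(__m128i blk[2], const __m128i *key, int rounds)
{
    const __m128i *end = key + rounds;       /* points at the last round key */
    ptrdiff_t i = 1 - (ptrdiff_t)rounds;     /* negative index, as in the asm */

    blk[0] = _mm_xor_si128(blk[0], key[0]);  /* whitening key */
    blk[1] = _mm_xor_si128(blk[1], key[0]);
    for (; i < 0; i++) {                     /* key[1] .. key[rounds-1] */
        blk[0] = _mm_aesenc_si128(blk[0], end[i]);
        blk[1] = _mm_aesenc_si128(blk[1], end[i]);
    }
    blk[0] = _mm_aesenclast_si128(blk[0], end[0]);
    blk[1] = _mm_aesenclast_si128(blk[1], end[0]);
}
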
.Ldec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 .byte 102,68,15,56,222,192 .byte 102,68,15,56,222,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Ldec_loop8 .byte 102,15,56,222,209 @@ -583,14 +646,13 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_3 + jnz .Loop_enc1_3 .byte 102,15,56,221,209 movups %xmm2,(%rsi) jmp .Lecb_ret .align 16 .Lecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp .Lecb_ret @@ -728,14 +790,13 @@ aesni_ecb_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_4 + jnz .Loop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) jmp .Lecb_ret .align 16 .Lecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 + call _aesni_decrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp .Lecb_ret @@ -782,53 +843,53 @@ aesni_ecb_encrypt: .align 16 aesni_ccm64_encrypt_blocks: movl 240(%rcx),%eax - movdqu (%r8),%xmm9 - movdqa .Lincrement64(%rip),%xmm6 + movdqu (%r8),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - shrl $1,%eax + shll $4,%eax + movl $16,%r10d leaq 0(%rcx),%r11 movdqu (%r9),%xmm3 - movdqa %xmm9,%xmm2 - movl %eax,%r10d -.byte 102,68,15,56,0,207 + movdqa %xmm6,%xmm2 + leaq 32(%rcx,%rax,1),%rcx +.byte 102,15,56,0,247 + subq %rax,%r10 jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: movups (%r11),%xmm0 - movl %r10d,%eax + movq %r10,%rax movups (%rdi),%xmm8 xorps %xmm0,%xmm2 movups 16(%r11),%xmm1 xorps %xmm8,%xmm0 - leaq 32(%r11),%rcx xorps %xmm0,%xmm3 - movups (%rcx),%xmm0 + movups 32(%r11),%xmm0 .Lccm64_enc2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 + decq %rdx .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decq %rdx leaq 16(%rdi),%rdi xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi .byte 102,15,56,0,215 + leaq 16(%rsi),%rsi jnz .Lccm64_enc_outer movups %xmm3,(%r9) @@ -839,15 +900,15 @@ aesni_ccm64_encrypt_blocks: .align 16 aesni_ccm64_decrypt_blocks: movl 240(%rcx),%eax - movups (%r8),%xmm9 + movups (%r8),%xmm6 movdqu (%r9),%xmm3 - movdqa .Lincrement64(%rip),%xmm6 + movdqa .Lincrement64(%rip),%xmm9 movdqa .Lbswap_mask(%rip),%xmm7 - movaps %xmm9,%xmm2 + movaps %xmm6,%xmm2 movl %eax,%r10d movq %rcx,%r11 -.byte 102,68,15,56,0,207 +.byte 102,15,56,0,247 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -857,17 +918,21 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 + jnz .Loop_enc1_5 .byte 102,15,56,221,209 + shll $4,%r10d + movl $16,%eax movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 leaq 16(%rdi),%rdi + subq %r10,%rax + leaq 32(%r11,%r10,1),%rcx + movq %rax,%r10 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 - movl %r10d,%eax + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) leaq 16(%rsi),%rsi .byte 102,15,56,0,215 @@ -876,36 +941,36 @@ aesni_ccm64_decrypt_blocks: jz .Lccm64_dec_break movups (%r11),%xmm0 - shrl $1,%eax + movq %r10,%rax 
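
[Editorial note] The CCM64 hunks here move the counter from %xmm9 into %xmm6 (and the increment constant the other way) and thread the same end-of-schedule indexing through the two-lane loop. Functionally each iteration still produces one CTR keystream block while folding the plaintext into the running CBC-MAC through the same key schedule; a hedged C sketch of one encrypt step, with hypothetical helper names:

#include <wmmintrin.h>
#include <stdint.h>

/* One CCM step: the keystream block and the CBC-MAC update share the
 * key schedule, so the two aesenc chains interleave as in .Lccm64_enc2_loop. */
static void ccm64_enc_step(__m128i *ctr, __m128i *cmac,
                           const uint8_t *in, uint8_t *out,
                           const __m128i *key, int rounds)
{
    __m128i pt = _mm_loadu_si128((const __m128i *)in);
    __m128i a = _mm_xor_si128(*ctr, key[0]);                     /* E(ctr) lane */
    __m128i b = _mm_xor_si128(_mm_xor_si128(*cmac, pt), key[0]); /* MAC lane */

    for (int i = 1; i < rounds; i++) {
        a = _mm_aesenc_si128(a, key[i]);
        b = _mm_aesenc_si128(b, key[i]);
    }
    a = _mm_aesenclast_si128(a, key[rounds]);
    *cmac = _mm_aesenclast_si128(b, key[rounds]);
    _mm_storeu_si128((__m128i *)out, _mm_xor_si128(a, pt));
    /* the caller then bumps the 64-bit counter half (paddq .Lincrement64) */
}
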
movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 - leaq 32(%r11),%rcx xorps %xmm0,%xmm2 xorps %xmm8,%xmm3 - movups (%rcx),%xmm0 - + movups 32(%r11),%xmm0 + jmp .Lccm64_dec2_loop +.align 16 .Lccm64_dec2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz .Lccm64_dec2_loop movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leaq 16(%rdi),%rdi .byte 102,15,56,221,208 .byte 102,15,56,221,216 + leaq 16(%rdi),%rdi jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: + movl 240(%r11),%eax movups (%r11),%xmm0 movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 @@ -916,7 +981,7 @@ aesni_ccm64_decrypt_blocks: decl %eax movups (%r11),%xmm1 leaq 16(%r11),%r11 - jnz .Loop_enc1_6 + jnz .Loop_enc1_6 .byte 102,15,56,221,217 movups %xmm3,(%r9) .byte 0xf3,0xc3 @@ -925,199 +990,518 @@ aesni_ccm64_decrypt_blocks: .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + cmpq $1,%rdx je .Lctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa .Lbswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 - + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %r11d,%eax + xorl %r11d,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 movl 240(%rcx),%eax + xorl %r11d,%r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,-40(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,-24(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + movl _gnutls_x86_cpuid_s+4(%rip),%r10d + xorl %r11d,%r9d + andl $71303168,%r10d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx jb .Lctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d + subq $6,%rdx + cmpl $4194304,%r10d + je .Lctr32_6x + + leaq 128(%rcx),%rcx + subq $2,%rdx + jmp .Lctr32_loop8 + +.align 16 +.Lctr32_6x: + shll $4,%eax + movl $48,%r10d + bswapl %r11d + leaq 32(%rcx,%rax,1),%rcx + subq %rax,%r10 jmp .Lctr32_loop6 .align 16 .Lctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps 
%xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + addl $6,%r8d + movups -48(%rcx,%r10,1),%xmm0 +.byte 102,15,56,220,209 + movl %r8d,%eax + xorl %r11d,%eax +.byte 102,15,56,220,217 +.byte 0x0f,0x38,0xf1,0x44,0x24,12 + leal 1(%r8),%eax +.byte 102,15,56,220,225 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,28 +.byte 102,15,56,220,233 + leal 2(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,241 +.byte 0x0f,0x38,0xf1,0x44,0x24,44 + leal 3(%r8),%eax +.byte 102,15,56,220,249 + movups -32(%rcx,%r10,1),%xmm1 + xorl %r11d,%eax +.byte 102,15,56,220,208 +.byte 0x0f,0x38,0xf1,0x44,0x24,60 + leal 4(%r8),%eax +.byte 102,15,56,220,216 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,76 +.byte 102,15,56,220,224 + leal 5(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,232 +.byte 0x0f,0x38,0xf1,0x44,0x24,92 + movq %r10,%rax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%r10,1),%xmm0 + call .Lenc_loop6 + movdqu (%rdi),%xmm8 + movdqu 16(%rdi),%xmm9 + movdqu 32(%rdi),%xmm10 + movdqu 48(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 80(%rdi),%xmm13 + leaq 96(%rdi),%rdi + movups -64(%rcx,%r10,1),%xmm1 + pxor %xmm2,%xmm8 + movaps 0(%rsp),%xmm2 + pxor %xmm3,%xmm9 + movaps 16(%rsp),%xmm3 + pxor %xmm4,%xmm10 + movaps 32(%rsp),%xmm4 + pxor %xmm5,%xmm11 + movaps 48(%rsp),%xmm5 + pxor %xmm6,%xmm12 + movaps 64(%rsp),%xmm6 + pxor %xmm7,%xmm13 + movaps 80(%rsp),%xmm7 + movdqu %xmm8,(%rsi) + movdqu %xmm9,16(%rsi) + movdqu %xmm10,32(%rsi) + movdqu %xmm11,48(%rsi) + movdqu %xmm12,64(%rsi) + movdqu %xmm13,80(%rsi) + leaq 96(%rsi),%rsi - pxor %xmm0,%xmm3 + subq $6,%rdx + jnc .Lctr32_loop6 + + addq $6,%rdx + jz .Lctr32_done + + leal -48(%r10),%eax + leaq -80(%rcx,%r10,1),%rcx + negl %eax + shrl $4,%eax + jmp .Lctr32_tail + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa .Lincrement32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa -40(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d + nop .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp .Lctr32_enc_loop6_enter -.align 16 -.Lctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz .Lctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 
102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %r11d,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd -24(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,-40(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,-24(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc .Lctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 - addq $6,%rdx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 
102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx jz .Lctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx .Lctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb .Lctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je .Lctr32_two + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb .Lctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je .Lctr32_four + call .Lenc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 
102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp .Lctr32_done .align 16 .Lctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -.Lctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1127,289 +1511,303 @@ aesni_ctr32_encrypt_blocks: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 + jnz .Loop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp .Lctr32_done - -.align 16 -.Lctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp .Lctr32_done - -.align 16 -.Lctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp .Lctr32_done .align 16 -.Lctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - .Lctr32_done: + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: .byte 0xf3,0xc3 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks .globl aesni_xts_encrypt .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_8: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_8 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_8 +.byte 102,15,56,221,209 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor 
%xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_enc_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.align 16 +.align 32 .Lxts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_enc_loop6_enter - -.align 16 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.align 32 .Lxts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lxts_enc_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 .byte 102,15,56,220,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movdqa 
%xmm9,%xmm14 .byte 102,15,56,220,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,221,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_enc_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_enc_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 addq $96,%rdx jz .Lxts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb .Lxts_enc_one + pxor %xmm0,%xmm12 je .Lxts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb .Lxts_enc_three + pxor %xmm0,%xmm14 je .Lxts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1450,7 +1848,7 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_9 + jnz .Loop_enc1_9 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1466,7 +1864,7 @@ aesni_xts_encrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1512,15 +1910,15 @@ aesni_xts_encrypt: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps 
%xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_enc_done @@ -1555,13 +1953,14 @@ aesni_xts_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_10 + jnz .Loop_enc1_10 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movups %xmm2,-16(%rsi) .Lxts_enc_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -1569,249 +1968,292 @@ aesni_xts_encrypt: .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 .Loop_enc1_11: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz .Loop_enc1_11 -.byte 102,68,15,56,221,249 + jnz .Loop_enc1_11 +.byte 102,15,56,221,209 xorl %eax,%eax testq $15,%rdx setnz %al shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_dec_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.align 16 +.align 32 .Lxts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 
64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_dec_loop6_enter - -.align 16 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.align 32 .Lxts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Lxts_dec_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz .Lxts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 .byte 102,15,56,222,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 
102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,223,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_dec_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax .Lxts_dec_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz .Lxts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb .Lxts_dec_one + pxor %xmm0,%xmm13 je .Lxts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb .Lxts_dec_three je .Lxts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1861,7 +2303,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_12 + jnz .Loop_dec1_12 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1878,7 +2320,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1904,7 +2346,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -1914,14 +2356,8 @@ aesni_xts_decrypt: .align 16 .Lxts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -1932,16 +2368,16 @@ aesni_xts_decrypt: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 
64(%rsi),%rsi jmp .Lxts_dec_done @@ -1965,7 +2401,7 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_13 + jnz .Loop_dec1_13 .byte 102,15,56,223,209 xorps %xmm11,%xmm2 movups %xmm2,(%rsi) @@ -1995,13 +2431,14 @@ aesni_xts_decrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_14 + jnz .Loop_dec1_14 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) .Lxts_dec_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2038,7 +2475,7 @@ aesni_cbc_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_enc1_15 + jnz .Loop_enc1_15 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -2054,163 +2491,397 @@ aesni_cbc_encrypt: .Lcbc_enc_tail: movq %rdx,%rcx xchgq %rdi,%rsi -.long 0x9066A4F3 +.long 0x9066A4F3 movl $16,%ecx subq %rdx,%rcx xorl %eax,%eax -.long 0x9066AAF3 +.long 0x9066AAF3 leaq -16(%rdi),%rdi movl %r10d,%eax movq %rdi,%rsi movq %r11,%rcx xorq %rdx,%rdx - jmp .Lcbc_enc_loop + jmp .Lcbc_enc_loop .align 16 .Lcbc_decrypt: - movups (%r8),%xmm9 + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe .Lcbc_dec_tail - shrl $1,%r10d - subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,-24(%rsp) + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl _gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + + andl $71303168,%r9d + subq $80,%rdx + cmpl $4194304,%r9d + je .Lcbc_dec_loop6_enter + subq $32,%rdx + leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 .Lcbc_dec_loop8: - movaps %xmm0,-24(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 .byte 102,68,15,56,222,193 + setnc %r11b + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call .Ldec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop 
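
[Editorial note] The CBC decrypt path is rebuilt around this fully unrolled eight-block pipeline, with a six-block variant selected at run time from the cached _gnutls_x86_cpuid_s flags, and the tail handling goes branchless: setnc %r11b / shlq $7,%r11 / addq %rdi,%r11 materializes an offset of either 0 or 128 bytes for reloading the chaining blocks without a jump. The chaining itself is the usual trick of saving each ciphertext block before decryption so it can XOR the next plaintext; a compact C model, where aesni_decrypt6 stands in for the six-block helper earlier in this file:

#include <wmmintrin.h>
#include <stdint.h>

/* Hypothetical C prototype standing in for the 6-block asm helper. */
void aesni_decrypt6(__m128i b[6], const __m128i *key, int rounds);

static void cbc_dec6(uint8_t *out, const uint8_t *in, __m128i *iv,
                     const __m128i *key, int rounds)
{
    __m128i c[6], b[6];
    for (int i = 0; i < 6; i++)                  /* keep ciphertext for chaining */
        b[i] = c[i] = _mm_loadu_si128((const __m128i *)in + i);

    aesni_decrypt6(b, key, rounds);

    b[0] = _mm_xor_si128(b[0], *iv);             /* first block chains off the IV */
    for (int i = 1; i < 6; i++)
        b[i] = _mm_xor_si128(b[i], c[i - 1]);
    *iv = c[5];                                  /* last ciphertext is the next IV */

    for (int i = 0; i < 6; i++)
        _mm_storeu_si128((__m128i *)out + i, b[i]);
}
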
+.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_done: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 +.byte 102,65,15,56,223,228 + leaq 128(%rdi),%rdi + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 + movdqu 16(%r11),%xmm12 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 +.byte 102,68,15,56,223,193 + movdqu 48(%r11),%xmm14 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + 
movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle .Lcbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_loop6: + movups %xmm7,(%rsi) leaq 16(%rsi),%rsi + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 +.Lcbc_dec_loop6_enter: + leaq 96(%rdi),%rdi + movdqa %xmm7,%xmm8 + + call _aesni_decrypt6 + + pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movq %r11,%rcx + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movl %r10d,%eax + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + subq $96,%rdx + ja .Lcbc_dec_loop6 + + movdqa %xmm7,%xmm2 + addq $80,%rdx + jle .Lcbc_dec_tail_collected + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + .Lcbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe .Lcbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe .Lcbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,-24(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq 
$112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2220,113 +2891,70 @@ aesni_cbc_encrypt: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: - movaps %xmm2,-24(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq -24(%rsp),%rsi -.long 0x9066A4F3 + leaq (%rsp),%rsi +.long 0x9066A4F3 .Lcbc_dec_ret: + leaq (%rbp),%rsp + popq %rbp .Lcbc_ret: .byte 0xf3,0xc3 .size 
@@ -2334,7 +2962,7 @@ aesni_cbc_encrypt:
.type aesni_set_decrypt_key,@function
.align 16
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
 call __aesni_set_encrypt_key
 shll $4,%esi
 testl %eax,%eax
@@ -2373,7 +3001,7 @@ aesni_set_decrypt_key:
.align 16
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
 movq $-1,%rax
 testq %rdi,%rdi
 jz .Lenc_key_ret
@@ -2569,6 +3197,8 @@ __aesni_set_encrypt_key:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/elf/e_padlock-x86.s
index 65f32f9601..0b8fc28810 100644
--- a/lib/accelerated/x86/elf/e_padlock-x86.s
+++ b/lib/accelerated/x86/elf/e_padlock-x86.s
@@ -187,16 +187,14 @@ padlock_ecb_encrypt:
 leal 16(%edx),%edx
 xorl %eax,%eax
 xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
 testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
 testl $15,%edi
 setz %al
 testl $15,%esi
 setz %bl
 testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
 negl %eax
 movl $512,%ebx
 notl %eax
@@ -208,10 +206,28 @@ padlock_ecb_encrypt:
 negl %eax
 andl $511,%ebx
 leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
 andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
 movl %edi,(%ebp)
 movl %esi,4(%ebp)
 movl %ecx,8(%ebp)
@@ -236,8 +252,8 @@ padlock_ecb_encrypt:
 testl $15,%edi
 jz .L010ecb_out_aligned
 movl %ebx,%ecx
- shrl $2,%ecx
 leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
.L010ecb_out_aligned:
@@ -247,43 +263,75 @@ padlock_ecb_encrypt:
 addl %ebx,%esi
 subl %ebx,%ecx
 movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
 cmpl %ebp,%esp
- je .L011ecb_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
+ cmpl %ebp,%esp
+ je .L012ecb_done
 pxor %xmm0,%xmm0
 leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
 movaps %xmm0,(%eax)
 leal 16(%eax),%eax
 cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
 leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
 xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
 shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
 movl $1,%eax
 leal 4(%esp),%esp
.L004ecb_abort:
@@ -307,19 +355,17 @@ padlock_cbc_encrypt:
 movl 28(%esp),%edx
 movl 32(%esp),%ecx
 testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
 testl $15,%ecx
- jnz .L015cbc_abort
- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax
+ jnz .L016cbc_abort
+ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax
 pushfl
 cld
 call _padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
 leal 16(%edx),%edx
 xorl %eax,%eax
 xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
 testl $32,(%edx)
 jnz .L018cbc_aligned
 testl $15,%edi
@@ -339,7 +385,25 @@ padlock_cbc_encrypt:
 negl %eax
 andl $511,%ebx
 leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
 andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
 jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -351,13 +415,13 @@ padlock_cbc_encrypt:
 testl $15,%edi
 cmovnzl %esp,%edi
 testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
 shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
 movl %ebx,%ecx
 movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
 shrl $4,%ecx
@@ -367,61 +431,93 @@ padlock_cbc_encrypt:
 movl (%ebp),%edi
 movl 12(%ebp),%ebx
 testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
 movl %ebx,%ecx
- shrl $2,%ecx
 leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
 movl 4(%ebp),%esi
 movl 8(%ebp),%ecx
 addl %ebx,%edi
 addl %ebx,%esi
 subl %ebx,%ecx
 movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
 cmpl %ebp,%esp
- je .L022cbc_done
+ je .L024cbc_done
 pxor %xmm0,%xmm0
 leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
 movaps %xmm0,(%eax)
 leal 16(%eax),%eax
 cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
 leal 24(%ebp),%esp
- jmp .L024cbc_exit
-.align 16
-.L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L019cbc_loop
+ jmp .L026cbc_exit
.align 16
.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
 shrl $4,%ecx
.byte 243,15,167,208
 movaps (%eax),%xmm0
 movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
 movl $1,%eax
 leal 4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
 popl %edi
 popl %esi
 popl %ebx
@@ -442,25 +538,25 @@ padlock_cfb_encrypt:
 movl 28(%esp),%edx
 movl 32(%esp),%ecx
 testl $15,%edx
- jnz .L026cfb_abort
+ jnz .L028cfb_abort
 testl $15,%ecx
- jnz .L026cfb_abort
- leal .Lpadlock_saved_context-.L027cfb_pic_point,%eax
+ jnz .L028cfb_abort
+ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax
 pushfl
 cld
 call _padlock_verify_ctx
-.L027cfb_pic_point:
+.L029cfb_pic_point:
 leal 16(%edx),%edx
 xorl %eax,%eax
 xorl %ebx,%ebx
 testl $32,(%edx)
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
 testl $15,%edi
 setz %al
 testl $15,%esi
 setz %bl
 testl %ebx,%eax
- jnz .L028cfb_aligned
+ jnz .L030cfb_aligned
 negl %eax
 movl $512,%ebx
 notl %eax
@@ -472,10 +568,15 @@ padlock_cfb_encrypt:
 negl %eax
 andl $511,%ebx
 leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
 andl $-16,%esp
- jmp .L029cfb_loop
+ movl %eax,16(%ebp)
+ jmp .L031cfb_loop
.align 16
-.L029cfb_loop:
+.L031cfb_loop:
 movl %edi,(%ebp)
 movl %esi,4(%ebp)
 movl %ecx,8(%ebp)
@@ -484,13 +585,13 @@ padlock_cfb_encrypt:
 testl $15,%edi
 cmovnzl %esp,%edi
 testl $15,%esi
- jz .L030cfb_inp_aligned
+ jz .L032cfb_inp_aligned
 shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
 movl %ebx,%ecx
 movl %edi,%esi
-.L030cfb_inp_aligned:
+.L032cfb_inp_aligned:
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
 shrl $4,%ecx
@@ -500,61 +601,45 @@ padlock_cfb_encrypt:
 movl (%ebp),%edi
 movl 12(%ebp),%ebx
 testl $15,%edi
- jz .L031cfb_out_aligned
+ jz .L033cfb_out_aligned
 movl %ebx,%ecx
- shrl $2,%ecx
 leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
-.L031cfb_out_aligned:
+.L033cfb_out_aligned:
 movl 4(%ebp),%esi
 movl 8(%ebp),%ecx
 addl %ebx,%edi
 addl %ebx,%esi
 subl %ebx,%ecx
 movl $512,%ebx
- jnz .L029cfb_loop
+ jnz .L031cfb_loop
 cmpl %ebp,%esp
- je .L032cfb_done
+ je .L034cfb_done
 pxor %xmm0,%xmm0
 leal (%esp),%eax
-.L033cfb_bzero:
+.L035cfb_bzero:
 movaps %xmm0,(%eax)
 leal 16(%eax),%eax
 cmpl %eax,%ebp
- ja .L033cfb_bzero
-.L032cfb_done:
+ ja .L035cfb_bzero
+.L034cfb_done:
+ movl 16(%ebp),%ebp
 leal 24(%ebp),%esp
- jmp .L034cfb_exit
+ jmp .L036cfb_exit
.align 16
-.L035cfb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L036cfb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L036cfb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L029cfb_loop
-.align 16
-.L028cfb_aligned:
+.L030cfb_aligned:
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
 shrl $4,%ecx
.byte 243,15,167,224
 movaps (%eax),%xmm0
 movaps %xmm0,-16(%edx)
-.L034cfb_exit:
+.L036cfb_exit:
 movl $1,%eax
 leal 4(%esp),%esp
-.L026cfb_abort:
+.L028cfb_abort:
 popl %edi
 popl %esi
 popl %ebx
@@ -605,7 +690,12 @@ padlock_ofb_encrypt:
 negl %eax
 andl $511,%ebx
 leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
 andl $-16,%esp
+ movl %eax,16(%ebp)
 jmp .L040ofb_loop
.align 16
.L040ofb_loop:
@@ -635,8 +725,8 @@ padlock_ofb_encrypt:
 testl $15,%edi
 jz .L042ofb_out_aligned
 movl %ebx,%ecx
- shrl $2,%ecx
 leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
 subl %ebx,%edi
.L042ofb_out_aligned:
@@ -657,26 +747,10 @@ padlock_ofb_encrypt:
 cmpl %eax,%ebp
 ja .L044ofb_bzero
.L043ofb_done:
+ movl 16(%ebp),%ebp
 leal 24(%ebp),%esp
 jmp .L045ofb_exit
.align 16
-.L046ofb_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L047ofb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L047ofb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L040ofb_loop
-.align 16
.L039ofb_aligned:
 leal -16(%edx),%eax
 leal 16(%edx),%ebx
@@ -708,14 +782,14 @@ padlock_ctr32_encrypt:
 movl 28(%esp),%edx
 movl 32(%esp),%ecx
 testl $15,%edx
- jnz .L048ctr32_abort
+ jnz .L046ctr32_abort
 testl $15,%ecx
- jnz .L048ctr32_abort
- leal .Lpadlock_saved_context-.L049ctr32_pic_point,%eax
+ jnz .L046ctr32_abort
+ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax
 pushfl
 cld
 call _padlock_verify_ctx
-.L049ctr32_pic_point:
+.L047ctr32_pic_point:
 leal 16(%edx),%edx
 xorl %eax,%eax
 movq -16(%edx),%mm0
@@ -729,10 +803,15 @@ padlock_ctr32_encrypt:
 negl %eax
 andl $511,%ebx
 leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
 andl $-16,%esp
- jmp .L050ctr32_loop
+ movl %eax,16(%ebp)
+ jmp .L048ctr32_loop
.align 16
-.L050ctr32_loop:
+.L048ctr32_loop:
 movl %edi,(%ebp)
 movl %esi,4(%ebp)
 movl %ecx,8(%ebp)
@@ -741,7 +820,7 @@ padlock_ctr32_encrypt:
 movl -4(%edx),%ecx
 xorl %edi,%edi
 movl -8(%edx),%eax
-.L051ctr32_prepare:
+.L049ctr32_prepare:
 movl %ecx,12(%esp,%edi,1)
 bswap %ecx
 movq %mm0,(%esp,%edi,1)
 incl %ecx
 bswap %ecx
 leal 16(%edi),%edi
 cmpl %ebx,%edi
- jb .L051ctr32_prepare
+ jb .L049ctr32_prepare
 movl %ecx,-4(%edx)
 leal (%esp),%esi
 leal (%esp),%edi
@@ -763,32 +842,33 @@ padlock_ctr32_encrypt:
 movl 12(%ebp),%ebx
 movl 4(%ebp),%esi
 xorl %ecx,%ecx
-.L052ctr32_xor:
+.L050ctr32_xor:
 movups (%esi,%ecx,1),%xmm1
 leal 16(%ecx),%ecx
 pxor -16(%esp,%ecx,1),%xmm1
 movups %xmm1,-16(%edi,%ecx,1)
 cmpl %ebx,%ecx
- jb .L052ctr32_xor
+ jb .L050ctr32_xor
 movl 8(%ebp),%ecx
 addl %ebx,%edi
 addl %ebx,%esi
 subl %ebx,%ecx
 movl $512,%ebx
- jnz .L050ctr32_loop
+ jnz .L048ctr32_loop
 pxor %xmm0,%xmm0
 leal (%esp),%eax
-.L053ctr32_bzero:
+.L051ctr32_bzero:
 movaps %xmm0,(%eax)
 leal 16(%eax),%eax
 cmpl %eax,%ebp
- ja .L053ctr32_bzero
-.L054ctr32_done:
+ ja .L051ctr32_bzero
+.L052ctr32_done:
+ movl 16(%ebp),%ebp
 leal 24(%ebp),%esp
 movl $1,%eax
 leal 4(%esp),%esp
 emms
-.L048ctr32_abort:
+.L046ctr32_abort:
 popl %edi
 popl %esi
 popl %ebx
@@ -814,10 +894,10 @@ _win32_segv_handler:
 movl 4(%esp),%edx
 movl 12(%esp),%ecx
 cmpl $3221225477,(%edx)
- jne .L055ret
+ jne .L053ret
 addl $4,184(%ecx)
 movl $0,%eax
-.L055ret:
+.L053ret:
 ret
.size _win32_segv_handler,.-_win32_segv_handler
.globl padlock_sha1_oneshot
diff --git a/lib/accelerated/x86/elf/e_padlock-x86_64.s b/lib/accelerated/x86/elf/e_padlock-x86_64.s
index 014ecaf569..1f86a997a8 100644
--- a/lib/accelerated/x86/elf/e_padlock-x86_64.s
+++ b/lib/accelerated/x86/elf/e_padlock-x86_64.s
@@ -127,7 +127,7 @@ padlock_aes_block:
 movq $1,%rcx
 leaq 32(%rdx),%rbx
 leaq 16(%rdx),%rdx
-.byte 0xf3,0x0f,0xa7,0xc8
+.byte 0xf3,0x0f,0xa7,0xc8
 movq %r8,%rbx
.byte 0xf3,0xc3
.size padlock_aes_block,.-padlock_aes_block
@@ -137,7 +137,7 @@ padlock_aes_block:
.align 16
padlock_xstore:
 movl %esi,%edx
-.byte 0x0f,0xa7,0xc0
+.byte 0x0f,0xa7,0xc0
.byte 0xf3,0xc3
.size padlock_xstore,.-padlock_xstore
@@ -154,7 +154,7 @@ padlock_sha1_oneshot:
 movq %rsp,%rdi
 movl %eax,16(%rsp)
 xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
 movaps (%rsp),%xmm0
 movl 16(%rsp),%eax
 addq $128+8,%rsp
@@ -176,7 +176,7 @@ padlock_sha1_blocks:
 movq %rsp,%rdi
 movl %eax,16(%rsp)
 movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xc8
+.byte 0xf3,0x0f,0xa6,0xc8
 movaps (%rsp),%xmm0
 movl 16(%rsp),%eax
 addq $128+8,%rsp
@@ -198,7 +198,7 @@ padlock_sha256_oneshot:
 movq %rsp,%rdi
 movaps %xmm1,16(%rsp)
 xorq %rax,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
 movaps (%rsp),%xmm0
 movaps 16(%rsp),%xmm1
 addq $128+8,%rsp
@@ -220,7 +220,7 @@ padlock_sha256_blocks:
 movq %rsp,%rdi
 movaps %xmm1,16(%rsp)
 movq $-1,%rax
-.byte 0xf3,0x0f,0xa6,0xd0
+.byte 0xf3,0x0f,0xa6,0xd0
 movaps (%rsp),%xmm0
 movaps 16(%rsp),%xmm1
 addq $128+8,%rsp
@@ -245,7 +245,7 @@ padlock_sha512_blocks:
 movaps %xmm1,16(%rsp)
 movaps %xmm2,32(%rsp)
 movaps %xmm3,48(%rsp)
-.byte 0xf3,0x0f,0xa6,0xe0
+.byte 0xf3,0x0f,0xa6,0xe0
 movaps (%rsp),%xmm0
 movaps 16(%rsp),%xmm1
 movaps 32(%rsp),%xmm2
@@ -276,8 +276,6 @@ padlock_ecb_encrypt:
 leaq 16(%rdx),%rdx
 xorl %eax,%eax
 xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
 testl $32,(%rdx)
 jnz .Lecb_aligned
 testq $15,%rdi
@@ -297,6 +295,21 @@ padlock_ecb_encrypt:
 negq %rax
 andq $512-1,%rbx
 leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
 jmp .Lecb_loop
.align 16
.Lecb_loop:
@@ -312,7 +325,7 @@ padlock_ecb_encrypt:
 testq $15,%rsi
 jz .Lecb_inp_aligned
 shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
 movq %rbx,%rcx
 movq %rdi,%rsi
@@ -320,15 +333,15 @@ padlock_ecb_encrypt:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
 movq %r8,%rdi
 movq %r11,%rbx
 testq $15,%rdi
 jz .Lecb_out_aligned
 movq %rbx,%rcx
- shrq $3,%rcx
 leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
.Lecb_out_aligned:
 movq %r9,%rsi
@@ -337,9 +350,26 @@ padlock_ecb_encrypt:
 addq %rbx,%rsi
 subq %rbx,%rcx
 movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
 cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.align 16
+.Lecb_break:
+ cmpq %rbp,%rsp
 je .Lecb_done
 pxor %xmm0,%xmm0
@@ -353,26 +383,39 @@ padlock_ecb_encrypt:
.Lecb_done:
 leaq (%rbp),%rsp
 jmp .Lecb_exit
-.align 16
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.align 16
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,200
+.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
 movl $1,%eax
 leaq 8(%rsp),%rsp
@@ -400,8 +443,6 @@ padlock_cbc_encrypt:
 leaq 16(%rdx),%rdx
 xorl %eax,%eax
 xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
 testl $32,(%rdx)
 jnz .Lcbc_aligned
 testq $15,%rdi
@@ -421,6 +462,21 @@ padlock_cbc_encrypt:
 negq %rax
 andq $512-1,%rbx
 leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
 jmp .Lcbc_loop
.align 16
.Lcbc_loop:
@@ -436,7 +492,7 @@ padlock_cbc_encrypt:
 testq $15,%rsi
 jz .Lcbc_inp_aligned
 shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
 movq %rbx,%rcx
 movq %rdi,%rsi
@@ -444,7 +500,7 @@ padlock_cbc_encrypt:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
 movq %r8,%rdi
@@ -452,9 +508,9 @@ padlock_cbc_encrypt:
 testq $15,%rdi
 jz .Lcbc_out_aligned
 movq %rbx,%rcx
- shrq $3,%rcx
 leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
.Lcbc_out_aligned:
 movq %r9,%rsi
@@ -463,9 +519,26 @@ padlock_cbc_encrypt:
 addq %rbx,%rsi
 subq %rbx,%rcx
 movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
 cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.align 16
+.Lcbc_break:
+ cmpq %rbp,%rsp
 je .Lcbc_done
 pxor %xmm0,%xmm0
@@ -479,28 +552,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
 leaq (%rbp),%rsp
 jmp .Lcbc_exit
-.align 16
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.align 16
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,208
+.byte 0xf3,0x0f,0xa7,208
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
 movl $1,%eax
 leaq 8(%rsp),%rsp
@@ -547,6 +633,8 @@ padlock_cfb_encrypt:
 negq %rax
 andq $512-1,%rbx
 leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
 jmp .Lcfb_loop
.align 16
.Lcfb_loop:
@@ -562,7 +650,7 @@ padlock_cfb_encrypt:
 testq $15,%rsi
 jz .Lcfb_inp_aligned
 shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
 movq %rbx,%rcx
 movq %rdi,%rsi
@@ -570,7 +658,7 @@ padlock_cfb_encrypt:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
 movq %r8,%rdi
@@ -578,9 +666,9 @@ padlock_cfb_encrypt:
 testq $15,%rdi
 jz .Lcfb_out_aligned
 movq %rbx,%rcx
- shrq $3,%rcx
 leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
.Lcfb_out_aligned:
 movq %r9,%rsi
@@ -590,8 +678,7 @@ padlock_cfb_encrypt:
 subq %rbx,%rcx
 movq $512,%rbx
 jnz .Lcfb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
 je .Lcfb_done
 pxor %xmm0,%xmm0
@@ -605,12 +692,13 @@ padlock_cfb_encrypt:
.Lcfb_done:
 leaq (%rbp),%rsp
 jmp .Lcfb_exit
+
.align 16
.Lcfb_aligned:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,224
+.byte 0xf3,0x0f,0xa7,224
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
.Lcfb_exit:
@@ -659,6 +747,8 @@ padlock_ofb_encrypt:
 negq %rax
 andq $512-1,%rbx
 leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
 jmp .Lofb_loop
.align 16
.Lofb_loop:
@@ -674,7 +764,7 @@ padlock_ofb_encrypt:
 testq $15,%rsi
 jz .Lofb_inp_aligned
 shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
 movq %rbx,%rcx
 movq %rdi,%rsi
@@ -682,7 +772,7 @@ padlock_ofb_encrypt:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
 movq %r8,%rdi
@@ -690,9 +780,9 @@ padlock_ofb_encrypt:
 testq $15,%rdi
 jz .Lofb_out_aligned
 movq %rbx,%rcx
- shrq $3,%rcx
 leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
.Lofb_out_aligned:
 movq %r9,%rsi
@@ -702,8 +792,7 @@ padlock_ofb_encrypt:
 subq %rbx,%rcx
 movq $512,%rbx
 jnz .Lofb_loop
-
- cmpq %rsp,%rbp
+ cmpq %rbp,%rsp
 je .Lofb_done
 pxor %xmm0,%xmm0
@@ -717,12 +806,13 @@ padlock_ofb_encrypt:
.Lofb_done:
 leaq (%rbp),%rsp
 jmp .Lofb_exit
+
.align 16
.Lofb_aligned:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,232
+.byte 0xf3,0x0f,0xa7,232
 movdqa (%rax),%xmm0
 movdqa %xmm0,-16(%rdx)
.Lofb_exit:
@@ -752,8 +842,6 @@ padlock_ctr32_encrypt:
 leaq 16(%rdx),%rdx
 xorl %eax,%eax
 xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lctr32_short
 testl $32,(%rdx)
 jnz .Lctr32_aligned
 testq $15,%rdi
@@ -773,15 +861,32 @@ padlock_ctr32_encrypt:
 negq %rax
 andq $512-1,%rbx
 leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
.Lctr32_reenter:
 movl -4(%rdx),%eax
 bswapl %eax
 negl %eax
 andl $31,%eax
- jz .Lctr32_loop
+ movq $512,%rbx
 shll $4,%eax
+ cmovzq %rbx,%rax
 cmpq %rax,%rcx
 cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja .Lctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lctr32_unaligned_tail
 jmp .Lctr32_loop
.align 16
.Lctr32_loop:
@@ -797,7 +902,7 @@ padlock_ctr32_encrypt:
 testq $15,%rsi
 jz .Lctr32_inp_aligned
 shrq $3,%rcx
-.byte 0xf3,0x48,0xa5
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
 movq %rbx,%rcx
 movq %rdi,%rsi
@@ -805,23 +910,23 @@ padlock_ctr32_encrypt:
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
 movl -4(%rdx),%eax
 testl $4294901760,%eax
- jnz .Lctr32_no_corr
+ jnz .Lctr32_no_carry
 bswapl %eax
 addl $65536,%eax
 bswapl %eax
 movl %eax,-4(%rdx)
-.Lctr32_no_corr:
+.Lctr32_no_carry:
 movq %r8,%rdi
 movq %r11,%rbx
 testq $15,%rdi
 jz .Lctr32_out_aligned
 movq %rbx,%rcx
- shrq $3,%rcx
 leaq (%rsp),%rsi
-.byte 0xf3,0x48,0xa5
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
 subq %rbx,%rdi
.Lctr32_out_aligned:
 movq %r9,%rsi
@@ -830,9 +935,38 @@ padlock_ctr32_encrypt:
 addq %rbx,%rsi
 subq %rbx,%rcx
 movq $512,%rbx
+ jz .Lctr32_break
+ cmpq %rbx,%rcx
+ jae .Lctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
 jnz .Lctr32_loop
-
+.Lctr32_unaligned_tail:
+ xorl %eax,%eax
 cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.align 16
+.Lctr32_break:
+ cmpq %rbp,%rsp
 je .Lctr32_done
 pxor %xmm0,%xmm0
@@ -846,56 +980,75 @@ padlock_ctr32_encrypt:
.Lctr32_done:
 leaq (%rbp),%rsp
 jmp .Lctr32_exit
-.align 16
-.Lctr32_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lctr32_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lctr32_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lctr32_reenter
+
.align 16
.Lctr32_aligned:
 movl -4(%rdx),%eax
- movq $1048576,%rbx
 bswapl %eax
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
 negl %eax
 andl $65535,%eax
- jz .Lctr32_aligned_loop
+ movq $1048576,%rbx
 shll $4,%eax
+ cmovzq %rbx,%rax
 cmpq %rax,%rcx
 cmovaq %rax,%rbx
- jmp .Lctr32_aligned_loop
-.align 16
+ cmovbeq %rcx,%rbx
+ jbe .Lctr32_aligned_skip
+
.Lctr32_aligned_loop:
- cmpq %rcx,%rbx
- cmovaq %rcx,%rbx
 movq %rcx,%r10
 movq %rbx,%rcx
 movq %rbx,%r11
+
 leaq -16(%rdx),%rax
 leaq 16(%rdx),%rbx
 shrq $4,%rcx
-.byte 0xf3,0x0f,0xa7,216
+.byte 0xf3,0x0f,0xa7,216
+
 movl -4(%rdx),%eax
 bswapl %eax
 addl $65536,%eax
 bswapl %eax
 movl %eax,-4(%rdx)
- movq %r11,%rbx
+
 movq %r10,%rcx
- subq %rbx,%rcx
+ subq %r11,%rcx
 movq $1048576,%rbx
- jnz .Lctr32_aligned_loop
+ jz .Lctr32_exit
+ cmpq %rbx,%rcx
+ jae .Lctr32_aligned_loop
+
+.Lctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz .Lctr32_exit
+
+.Lctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
.Lctr32_exit:
 movl $1,%eax
 leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s
index f60c95b546..17f6bebe6e 100644
--- a/lib/accelerated/x86/elf/ghash-x86_64.s
+++ b/lib/accelerated/x86/elf/ghash-x86_64.s
@@ -39,6 +39,7 @@
 #
.text
+
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,@function
.align 16
@@ -697,6 +698,7 @@ gcm_ghash_4bit:
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
+.L_init_clmul:
 movdqu (%rsi),%xmm2
 pshufd $78,%xmm2,%xmm2
@@ -715,15 +717,15 @@ gcm_init_clmul:
 pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
 movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
 pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3
@@ -733,44 +735,134 @@ gcm_init_clmul:
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
 psllq $1,%xmm0
 pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
 movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
 psrlq $5,%xmm0
 pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
 movdqu (%rdi),%xmm0
 movdqa .Lbswap_mask(%rip),%xmm5
 movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
 pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -783,201 +875,381 @@ gcm_gmult_clmul:
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
 movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,@function
-.align 16
+.align 32
gcm_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm5
+.L_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm10
 movdqu (%rdi),%xmm0
 movdqu (%rsi),%xmm2
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
 subq $16,%rcx
 jz .Lodd_tail
- movdqu 16(%rsi),%xmm8
+ movdqu 16(%rsi),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $48,%rcx
+ jb .Lskip4x
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+ subq $48,%rcx
+ movq $11547335547999543296,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
 movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe .Leven_tail
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc .Ltail4x
-.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+.byte 102,68,15,58,68,218,0
+ xorps %xmm4,%xmm8
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
-
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
-
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
- pxor %xmm3,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm3,%xmm4
+ pxor %xmm1,%xmm8
+ movdqa %xmm8,%xmm9
+ pslldq $8,%xmm8
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+.byte 102,68,15,58,68,231,0
+ pxor %xmm0,%xmm9
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pxor %xmm1,%xmm0
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+.byte 102,68,15,58,68,199,16
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
-.byte 102,15,58,68,250,17
 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz .Ldone
+ movdqu 32(%rsi),%xmm7
+ subq $16,%rcx
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
-.byte 102,69,15,58,68,202,0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $32,%rcx
+ jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+
+.align 32
+.Lmod_loop:
 movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm5
+ pxor %xmm0,%xmm8
+.byte 102,65,15,56,0,234
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm5,%xmm1
 pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+.byte 102,15,58,68,234,17
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
 leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+.byte 0x66,0x90
+
 subq $32,%rcx
 ja .Lmod_loop
.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
 pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
 pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+
 movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
 testq %rcx,%rcx
 jnz .Ldone
.Lodd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
 pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,223,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3
@@ -987,38 +1259,60 @@ gcm_ghash_clmul:
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
 movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.align 64
.type .Lrem_4bit,@object
.Lrem_4bit:
diff --git a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
index 3bb65a54e9..116efd0c18 100644
--- a/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
+++ b/lib/accelerated/x86/elf/sha1-ssse3-x86_64.s
@@ -46,23 +46,25 @@ sha1_block_data_order:
 movl _gnutls_x86_cpuid_s+0(%rip),%r9d
 movl _gnutls_x86_cpuid_s+4(%rip),%r8d
+ movl _gnutls_x86_cpuid_s+8(%rip),%r10d
 testl $512,%r8d
 jz .Lialu
 jmp _ssse3_shortcut
.align 16
.Lialu:
+ movq %rsp,%rax
 pushq %rbx
 pushq %rbp
 pushq %r12
 pushq %r13
- movq %rsp,%r11
+ pushq %r14
 movq %rdi,%r8
 subq $72,%rsp
 movq %rsi,%r9
 andq $-64,%rsp
 movq %rdx,%r10
- movq %r11,64(%rsp)
+ movq %rax,64(%rsp)
.Lprologue:
 movl 0(%r8),%esi
@@ -76,1230 +78,1168 @@ sha1_block_data_order:
.Lloop:
 movl 0(%r9),%edx
 bswapl %edx
- movl %edx,0(%rsp)
- movl %r11d,%eax
 movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
 movl %esi,%ecx
- xorl %r12d,%eax
 bswapl %ebp
+ xorl %r11d,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
 andl %edi,%eax
- movl %ebp,4(%rsp)
+ leal 1518500249(%rdx,%r13,1),%r13d
 addl %ecx,%r13d
 xorl %r12d,%eax
 roll $30,%edi
 addl %eax,%r13d
- movl %edi,%eax
- movl 8(%r9),%edx
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
 movl %r13d,%ecx
- xorl %r11d,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %edi,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
 andl %esi,%eax
- movl %edx,8(%rsp)
+ leal 1518500249(%rbp,%r12,1),%r12d
 addl %ecx,%r12d
 xorl %r11d,%eax
 roll $30,%esi
 addl %eax,%r12d
- movl %esi,%eax
- movl 12(%r9),%ebp
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
 movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %esi,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
 andl %r13d,%eax
- movl %ebp,12(%rsp)
+ leal 1518500249(%r14,%r11,1),%r11d
 addl %ecx,%r11d
 xorl %edi,%eax
 roll $30,%r13d
 addl %eax,%r11d
- movl %r13d,%eax
- movl 16(%r9),%edx
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
 movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r13d,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
 andl %r12d,%eax
- movl %edx,16(%rsp)
+ leal 1518500249(%rdx,%rdi,1),%edi
 addl %ecx,%edi
 xorl %esi,%eax
 roll $30,%r12d
 addl %eax,%edi
- movl %r12d,%eax
- movl 20(%r9),%ebp
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
 movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r12d,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
 andl %r11d,%eax
- movl %ebp,20(%rsp)
+ leal 1518500249(%rbp,%rsi,1),%esi
 addl %ecx,%esi
 xorl %r13d,%eax
 roll $30,%r11d
 addl %eax,%esi
- movl %r11d,%eax
 movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
 movl %esi,%ecx
- xorl %r12d,%eax
 bswapl %edx
+ xorl %r11d,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%r13,1),%r13d
 andl %edi,%eax
- movl %edx,24(%rsp)
+ leal 1518500249(%r14,%r13,1),%r13d
 addl %ecx,%r13d
 xorl %r12d,%eax
 roll $30,%edi
 addl %eax,%r13d
- movl %edi,%eax
 movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
 movl %r13d,%ecx
- xorl %r11d,%eax
 bswapl %ebp
+ xorl %edi,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%r12,1),%r12d
 andl %esi,%eax
- movl %ebp,28(%rsp)
+ leal 1518500249(%rdx,%r12,1),%r12d
 addl %ecx,%r12d
 xorl %r11d,%eax
 roll $30,%esi
 addl %eax,%r12d
- movl %esi,%eax
- movl 32(%r9),%edx
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
 movl %r12d,%ecx
- xorl %edi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %esi,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%r11,1),%r11d
 andl %r13d,%eax
- movl %edx,32(%rsp)
+ leal 1518500249(%rbp,%r11,1),%r11d
 addl %ecx,%r11d
 xorl %edi,%eax
 roll $30,%r13d
 addl %eax,%r11d
- movl %r13d,%eax
- movl 36(%r9),%ebp
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
 movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r13d,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%rdi,1),%edi
 andl %r12d,%eax
- movl %ebp,36(%rsp)
+ leal 1518500249(%r14,%rdi,1),%edi
 addl %ecx,%edi
 xorl %esi,%eax
 roll $30,%r12d
 addl %eax,%edi
- movl %r12d,%eax
- movl 40(%r9),%edx
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
 movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %edx
+ bswapl %ebp
+ xorl %r12d,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%rsi,1),%esi
 andl %r11d,%eax
- movl %edx,40(%rsp)
+ leal 1518500249(%rdx,%rsi,1),%esi
 addl %ecx,%esi
 xorl %r13d,%eax
 roll $30,%r11d
 addl %eax,%esi
- movl %r11d,%eax
- movl 44(%r9),%ebp
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
 movl %esi,%ecx
- xorl %r12d,%eax
- bswapl %ebp
+ bswapl %r14d
+ xorl %r11d,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%r13,1),%r13d
 andl %edi,%eax
- movl %ebp,44(%rsp)
+ leal 1518500249(%rbp,%r13,1),%r13d
 addl %ecx,%r13d
 xorl %r12d,%eax
 roll $30,%edi
 addl %eax,%r13d
- movl %edi,%eax
 movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
 movl %r13d,%ecx
- xorl %r11d,%eax
 bswapl %edx
+ xorl %edi,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%r12,1),%r12d
 andl %esi,%eax
- movl %edx,48(%rsp)
+ leal 1518500249(%r14,%r12,1),%r12d
 addl %ecx,%r12d
 xorl %r11d,%eax
 roll $30,%esi
 addl %eax,%r12d
- movl %esi,%eax
 movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
 movl %r12d,%ecx
- xorl %edi,%eax
 bswapl %ebp
+ xorl %esi,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%r11,1),%r11d
 andl %r13d,%eax
- movl %ebp,52(%rsp)
+ leal 1518500249(%rdx,%r11,1),%r11d
 addl %ecx,%r11d
 xorl %edi,%eax
 roll $30,%r13d
 addl %eax,%r11d
- movl %r13d,%eax
- movl 56(%r9),%edx
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
 movl %r11d,%ecx
- xorl %esi,%eax
- bswapl %edx
+ bswapl %r14d
+ xorl %r13d,%eax
 roll $5,%ecx
- leal 1518500249(%rbp,%rdi,1),%edi
 andl %r12d,%eax
- movl %edx,56(%rsp)
+ leal 1518500249(%rbp,%rdi,1),%edi
 addl %ecx,%edi
 xorl %esi,%eax
 roll $30,%r12d
 addl %eax,%edi
- movl %r12d,%eax
- movl 60(%r9),%ebp
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
 movl %edi,%ecx
- xorl %r13d,%eax
- bswapl %ebp
+ bswapl %edx
+ xorl %r12d,%eax
 roll $5,%ecx
- leal 1518500249(%rdx,%rsi,1),%esi
 andl %r11d,%eax
- movl %ebp,60(%rsp)
+ leal 1518500249(%r14,%rsi,1),%esi
 addl %ecx,%esi
 xorl %r13d,%eax
 roll $30,%r11d
 addl %eax,%esi
- movl 0(%rsp),%edx
- movl %r11d,%eax
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
 movl %esi,%ecx
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
 roll $5,%ecx
- xorl 32(%rsp),%edx
+ xorl 32(%rsp),%ebp
 andl %edi,%eax
- leal 1518500249(%rbp,%r13,1),%r13d
- xorl 52(%rsp),%edx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
 xorl %r12d,%eax
- roll $1,%edx
 addl %ecx,%r13d
- roll $30,%edi
- movl %edx,0(%rsp)
+ roll $1,%ebp
 addl %eax,%r13d
- movl 4(%rsp),%ebp
- movl %edi,%eax
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
 movl %r13d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r11d,%eax
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
 roll $5,%ecx
- xorl 36(%rsp),%ebp
+ xorl 36(%rsp),%r14d
 andl %esi,%eax
- leal 1518500249(%rdx,%r12,1),%r12d
- xorl 56(%rsp),%ebp
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
 xorl %r11d,%eax
- roll $1,%ebp
 addl %ecx,%r12d
- roll $30,%esi
- movl %ebp,4(%rsp)
+ roll $1,%r14d
 addl %eax,%r12d
- movl 8(%rsp),%edx
- movl %esi,%eax
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
 movl %r12d,%ecx
 xorl 16(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
 roll $5,%ecx
 xorl 40(%rsp),%edx
 andl %r13d,%eax
- leal 1518500249(%rbp,%r11,1),%r11d
- xorl 60(%rsp),%edx
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
 xorl %edi,%eax
- roll $1,%edx
 addl %ecx,%r11d
- roll $30,%r13d
- movl %edx,8(%rsp)
+ roll $1,%edx
 addl %eax,%r11d
- movl 12(%rsp),%ebp
- movl %r13d,%eax
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
 movl %r11d,%ecx
 xorl 20(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r13d,%eax
 roll $5,%ecx
 xorl 44(%rsp),%ebp
 andl %r12d,%eax
 leal 1518500249(%rdx,%rdi,1),%edi
- xorl 0(%rsp),%ebp
+ roll $30,%r12d
 xorl %esi,%eax
- roll $1,%ebp
 addl %ecx,%edi
- roll $30,%r12d
- movl %ebp,12(%rsp)
+ roll $1,%ebp
 addl %eax,%edi
- movl 16(%rsp),%edx
- movl %r12d,%eax
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
 movl %edi,%ecx
- xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
 roll $5,%ecx
- xorl 48(%rsp),%edx
+ xorl 48(%rsp),%r14d
 andl %r11d,%eax
 leal 1518500249(%rbp,%rsi,1),%esi
- xorl 4(%rsp),%edx
+ roll $30,%r11d
 xorl %r13d,%eax
- roll $1,%edx
 addl %ecx,%esi
- roll $30,%r11d
- movl %edx,16(%rsp)
+ roll $1,%r14d
 addl %eax,%esi
- movl 20(%rsp),%ebp
- movl %r11d,%eax
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
 movl %esi,%ecx
- xorl 28(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
- xorl 52(%rsp),%ebp
+ xorl 28(%rsp),%edx
 xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
 addl %ecx,%r13d
- xorl 8(%rsp),%ebp
 roll $30,%edi
 addl %eax,%r13d
- roll $1,%ebp
- movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %edi,%eax
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
 movl %r13d,%ecx
- xorl 32(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r12,1),%r12d
- xorl 56(%rsp),%edx
+ xorl 32(%rsp),%ebp
 xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
 addl %ecx,%r12d
- xorl 12(%rsp),%edx
 roll $30,%esi
 addl %eax,%r12d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
 movl %r12d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%r14d
 xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
 addl %ecx,%r11d
- xorl 16(%rsp),%ebp
 roll $30,%r13d
 addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
 movl %r11d,%ecx
 xorl 40(%rsp),%edx
- xorl %r12d,%eax
+ xorl %esi,%eax
 roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
 xorl 0(%rsp),%edx
- xorl %esi,%eax
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
 addl %ecx,%edi
- xorl 20(%rsp),%edx
 roll $30,%r12d
 addl %eax,%edi
 roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
 movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r12d,%eax
 movl %edi,%ecx
 xorl 44(%rsp),%ebp
- xorl %r11d,%eax
+ xorl %r13d,%eax
 roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
 xorl 4(%rsp),%ebp
- xorl %r13d,%eax
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
 addl %ecx,%esi
- xorl 24(%rsp),%ebp
 roll $30,%r11d
 addl %eax,%esi
 roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
 movl %ebp,36(%rsp)
- movl 40(%rsp),%edx
- movl %r11d,%eax
 movl %esi,%ecx
- xorl 48(%rsp),%edx
- xorl %edi,%eax
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
 roll $5,%ecx
+ xorl 8(%rsp),%r14d
 leal 1859775393(%rbp,%r13,1),%r13d
- xorl 8(%rsp),%edx
- xorl %r12d,%eax
+ xorl %r11d,%eax
 addl %ecx,%r13d
- xorl 28(%rsp),%edx
 roll $30,%edi
 addl %eax,%r13d
- roll $1,%edx
- movl %edx,40(%rsp)
- movl 44(%rsp),%ebp
- movl %edi,%eax
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
 movl %r13d,%ecx
- xorl 52(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl 12(%rsp),%ebp
+ xorl 52(%rsp),%edx
 xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
 addl %ecx,%r12d
- xorl 32(%rsp),%ebp
 roll $30,%esi
 addl %eax,%r12d
- roll $1,%ebp
- movl %ebp,44(%rsp)
- movl 48(%rsp),%edx
- movl %esi,%eax
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
 movl %r12d,%ecx
- xorl 56(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl 16(%rsp),%edx
+ xorl 56(%rsp),%ebp
 xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
 addl %ecx,%r11d
- xorl 36(%rsp),%edx
 roll $30,%r13d
 addl %eax,%r11d
- roll $1,%edx
- movl %edx,48(%rsp)
- movl 52(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
 movl %r11d,%ecx
- xorl 60(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 20(%rsp),%ebp
+ xorl 60(%rsp),%r14d
 xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
 addl %ecx,%edi
- xorl 40(%rsp),%ebp
 roll $30,%r12d
 addl %eax,%edi
- roll $1,%ebp
- movl %ebp,52(%rsp)
- movl 56(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
 movl %edi,%ecx
 xorl 0(%rsp),%edx
- xorl %r11d,%eax
+ xorl %r13d,%eax
 roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
 xorl 24(%rsp),%edx
- xorl %r13d,%eax
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
 addl %ecx,%esi
- xorl 44(%rsp),%edx
 roll $30,%r11d
 addl %eax,%esi
 roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
 movl %edx,56(%rsp)
- movl 60(%rsp),%ebp
- movl %r11d,%eax
 movl %esi,%ecx
 xorl 4(%rsp),%ebp
- xorl %edi,%eax
+ xorl %r12d,%eax
 roll $5,%ecx
- leal 1859775393(%rdx,%r13,1),%r13d
 xorl 28(%rsp),%ebp
- xorl %r12d,%eax
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
 addl %ecx,%r13d
- xorl 48(%rsp),%ebp
 roll $30,%edi
 addl %eax,%r13d
 roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
 movl %ebp,60(%rsp)
- movl 0(%rsp),%edx
- movl %edi,%eax
 movl %r13d,%ecx
- xorl 8(%rsp),%edx
- xorl %esi,%eax
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
 roll $5,%ecx
+ xorl 32(%rsp),%r14d
 leal 1859775393(%rbp,%r12,1),%r12d
- xorl 32(%rsp),%edx
- xorl %r11d,%eax
+ xorl %edi,%eax
 addl %ecx,%r12d
- xorl 52(%rsp),%edx
 roll $30,%esi
 addl %eax,%r12d
- roll $1,%edx
- movl %edx,0(%rsp)
- movl 4(%rsp),%ebp
- movl %esi,%eax
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
 movl %r12d,%ecx
- xorl 12(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl 36(%rsp),%ebp
+ xorl 12(%rsp),%edx
 xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
 addl %ecx,%r11d
- xorl 56(%rsp),%ebp
 roll $30,%r13d
 addl %eax,%r11d
- roll $1,%ebp
- movl %ebp,4(%rsp)
- movl 8(%rsp),%edx
- movl %r13d,%eax
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
 movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rdi,1),%edi
- xorl 40(%rsp),%edx
+ xorl 16(%rsp),%ebp
 xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
 addl %ecx,%edi
- xorl 60(%rsp),%edx
 roll $30,%r12d
 addl %eax,%edi
- roll $1,%edx
- movl %edx,8(%rsp)
- movl 12(%rsp),%ebp
- movl %r12d,%eax
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
 movl %edi,%ecx
- xorl 20(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl 44(%rsp),%ebp
+ xorl 20(%rsp),%r14d
 xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
 addl %ecx,%esi
- xorl 0(%rsp),%ebp
 roll $30,%r11d
 addl %eax,%esi
- roll $1,%ebp
- movl %ebp,12(%rsp)
- movl 16(%rsp),%edx
- movl %r11d,%eax
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
 movl %esi,%ecx
 xorl 24(%rsp),%edx
- xorl %edi,%eax
+ xorl %r12d,%eax
 roll $5,%ecx
- leal 1859775393(%rbp,%r13,1),%r13d
 xorl 48(%rsp),%edx
- xorl %r12d,%eax
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
 addl %ecx,%r13d
- xorl 4(%rsp),%edx
 roll $30,%edi
 addl %eax,%r13d
 roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
 movl %edx,16(%rsp)
- movl 20(%rsp),%ebp
- movl %edi,%eax
 movl %r13d,%ecx
 xorl 28(%rsp),%ebp
- xorl %esi,%eax
+ xorl %r11d,%eax
 roll $5,%ecx
- leal 1859775393(%rdx,%r12,1),%r12d
 xorl 52(%rsp),%ebp
- xorl %r11d,%eax
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
 addl %ecx,%r12d
- xorl 8(%rsp),%ebp
 roll $30,%esi
 addl %eax,%r12d
 roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
 movl %ebp,20(%rsp)
- movl 24(%rsp),%edx
- movl %esi,%eax
 movl %r12d,%ecx
- xorl
 32(%rsp),%r14d
+ xorl %edi,%eax
 roll $5,%ecx
+ xorl 56(%rsp),%r14d
 leal 1859775393(%rbp,%r11,1),%r11d
- xorl 56(%rsp),%edx
- xorl %edi,%eax
+ xorl %esi,%eax
 addl %ecx,%r11d
- xorl 12(%rsp),%edx
 roll $30,%r13d
 addl %eax,%r11d
- roll $1,%edx
- movl %edx,24(%rsp)
- movl 28(%rsp),%ebp
- movl %r13d,%eax
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
 movl %r11d,%ecx
- xorl 36(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl 60(%rsp),%ebp
+ xorl 36(%rsp),%edx
 xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
 addl %ecx,%edi
- xorl 16(%rsp),%ebp
 roll $30,%r12d
 addl %eax,%edi
- roll $1,%ebp
- movl %ebp,28(%rsp)
- movl 32(%rsp),%edx
- movl %r12d,%eax
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
 movl %edi,%ecx
- xorl 40(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- leal 1859775393(%rbp,%rsi,1),%esi
- xorl 0(%rsp),%edx
+ xorl 40(%rsp),%ebp
 xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
 addl %ecx,%esi
- xorl 20(%rsp),%edx
 roll $30,%r11d
 addl %eax,%esi
- roll $1,%edx
- movl %edx,32(%rsp)
- movl 36(%rsp),%ebp
- movl %r11d,%eax
- movl %r11d,%ebx
- xorl 44(%rsp),%ebp
- andl %r12d,%eax
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
 movl %esi,%ecx
- xorl 4(%rsp),%ebp
- xorl %r12d,%ebx
- leal -1894007588(%rdx,%r13,1),%r13d
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
 roll $5,%ecx
- xorl 24(%rsp),%ebp
 addl %eax,%r13d
+ roll $1,%r14d
 andl %edi,%ebx
- roll $1,%ebp
- addl %ebx,%r13d
- roll $30,%edi
- movl %ebp,36(%rsp)
 addl %ecx,%r13d
- movl 40(%rsp),%edx
- movl %edi,%eax
- movl %edi,%ebx
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
 xorl 48(%rsp),%edx
- andl %r11d,%eax
+ andl %edi,%eax
 movl %r13d,%ecx
 xorl 8(%rsp),%edx
- xorl %r11d,%ebx
- leal -1894007588(%rbp,%r12,1),%r12d
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
 roll $5,%ecx
- xorl 28(%rsp),%edx
 addl %eax,%r12d
- andl %esi,%ebx
 roll $1,%edx
- addl %ebx,%r12d
+ andl %esi,%ebx
+ addl %ecx,%r12d
 roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
 movl %edx,40(%rsp)
- addl %ecx,%r12d
- movl 44(%rsp),%ebp
- movl %esi,%eax
- movl %esi,%ebx
+ movl %edi,%ebx
 xorl 52(%rsp),%ebp
- andl %edi,%eax
+ andl %esi,%eax
 movl %r12d,%ecx
 xorl 12(%rsp),%ebp
- xorl %edi,%ebx
 leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
 roll $5,%ecx
- xorl 32(%rsp),%ebp
 addl %eax,%r11d
- andl %r13d,%ebx
 roll $1,%ebp
- addl %ebx,%r11d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
 roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
 movl %ebp,44(%rsp)
- addl %ecx,%r11d
- movl 48(%rsp),%edx
- movl %r13d,%eax
- movl %r13d,%ebx
- xorl 56(%rsp),%edx
- andl %esi,%eax
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
 movl %r11d,%ecx
- xorl 16(%rsp),%edx
- xorl %esi,%ebx
+ xorl 16(%rsp),%r14d
 leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
 roll $5,%ecx
- xorl 36(%rsp),%edx
 addl %eax,%edi
+ roll $1,%r14d
 andl %r12d,%ebx
- roll $1,%edx
- addl %ebx,%edi
- roll $30,%r12d
- movl %edx,48(%rsp)
 addl %ecx,%edi
- movl 52(%rsp),%ebp
- movl %r12d,%eax
- movl %r12d,%ebx
- xorl 60(%rsp),%ebp
- andl %r13d,%eax
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
 movl %edi,%ecx
- xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl 
%ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal -1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - 
xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax 
addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx 
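The large immediates in the `leal` instructions here are the four SHA-1 round constants, printed as signed 32-bit values: 0x5a827999 (rounds 0-19), 1859775393 = 0x6ed9eba1 (rounds 20-39), -1894007588 = 0x8f1bbcdc (rounds 40-59), and -899497514 = 0xca62c1d6 (rounds 60-79); `leal K(%reg,%reg2,1)` folds "e + K + W[t]" into one instruction. The surrounding `andl`/`xorl` pairs compute the per-round boolean function. One detail worth calling out: for rounds 40-59 the code evaluates Maj(b,c,d) as (c & d) plus ((c ^ d) & b), accumulated with two separate `addl`s, since the two terms can never both have a 1 in the same bit, ADD and OR coincide. A hedged C rendering (the helper name is illustrative):

	#include <stdint.h>

	/* The four SHA-1 round functions of FIPS 180-4, written in
	 * the shapes the generated code uses. */
	static uint32_t sha1_f(unsigned t, uint32_t b, uint32_t c, uint32_t d)
	{
	    if (t < 20)
	        return (b & c) | (~b & d);   /* Ch: b selects c or d */
	    if (t < 40)
	        return b ^ c ^ d;            /* Parity */
	    if (t < 60)
	        /* Maj, split into two bit-disjoint terms so the
	         * assembly can use two adds instead of an OR. */
	        return (c & d) + ((c ^ d) & b);
	    return b ^ c ^ d;                /* Parity again */
	}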
- movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1319,11 +1259,12 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha1_block_data_order,.-sha1_block_data_order @@ -1331,17 +1272,22 @@ sha1_block_data_order: .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 leaq -64(%rsp),%rsp + movq %rax,%r14 + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r11 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1349,19 +1295,22 @@ _ssse3_shortcut: movl 12(%r8),%edx movl %ebx,%esi movl 16(%r8),%ebp + 
movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1373,904 +1322,882 @@ _ssse3_shortcut: jmp .Loop_ssse3 .align 16 .Loop_ssse3: - movdqa %xmm1,%xmm4 - addl 0(%rsp),%ebp - xorl %edx,%ecx + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx roll $5,%eax - paddd %xmm3,%xmm9 - andl %ecx,%esi - xorl %edx,%ecx + addl %esi,%ebp psrldq $4,%xmm8 - xorl %edx,%esi - addl %eax,%ebp + andl %ebx,%edi + xorl %ecx,%ebx pxor %xmm0,%xmm4 - rorl $2,%ebx - addl %esi,%ebp + addl %eax,%ebp + rorl $7,%eax pxor %xmm2,%xmm8 - addl 4(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 4(%rsp),%edx pxor %xmm8,%xmm4 - andl %ebx,%edi - xorl %ecx,%ebx + xorl %ebx,%eax + roll $5,%ebp movdqa %xmm9,48(%rsp) - xorl %ecx,%edi - addl %ebp,%edx - movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 - rorl $7,%eax addl %edi,%edx - addl 8(%rsp),%ecx + andl %eax,%esi + movdqa %xmm4,%xmm10 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi - roll $5,%edx - andl %eax,%esi - xorl %ebx,%eax + addl 8(%rsp),%ecx psrld $31,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - movdqa %xmm10,%xmm9 - rorl $7,%ebp + xorl %eax,%ebp + roll $5,%edx addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx por %xmm8,%xmm4 - addl 12(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 - andl %ebp,%edi - xorl %eax,%ebp - movdqa 0(%r11),%xmm10 - xorl %eax,%edi - addl %ecx,%ebx - pxor %xmm9,%xmm4 - rorl $7,%edx + xorl %ebp,%edx + movdqa -64(%r11),%xmm10 + roll $5,%ecx addl %edi,%ebx - movdqa %xmm2,%xmm5 - addl 16(%rsp),%eax + andl %edx,%esi + pxor %xmm9,%xmm4 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx roll $5,%ebx - paddd %xmm4,%xmm10 - andl %edx,%esi - xorl %ebp,%edx + addl %esi,%eax psrldq $4,%xmm9 - xorl %ebp,%esi - addl %ebx,%eax + andl %ecx,%edi + xorl %edx,%ecx pxor %xmm1,%xmm5 - rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax + rorl $7,%ebx pxor %xmm3,%xmm9 - addl 20(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 20(%rsp),%ebp pxor %xmm9,%xmm5 - andl %ecx,%edi - xorl %edx,%ecx + xorl %ecx,%ebx + roll $5,%eax movdqa %xmm10,0(%rsp) - xorl %edx,%edi - addl %eax,%ebp - movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 - rorl $7,%ebx addl %edi,%ebp - addl 24(%rsp),%edx + andl %ebx,%esi + movdqa %xmm5,%xmm8 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi - roll $5,%ebp - andl %ebx,%esi - xorl %ecx,%ebx + addl 24(%rsp),%edx psrld $31,%xmm9 - xorl %ecx,%esi - addl %ebp,%edx - movdqa %xmm8,%xmm10 - rorl $7,%eax + xorl %ebx,%eax + roll $5,%ebp addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax psrld 
$30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp por %xmm9,%xmm5 - addl 28(%rsp),%ecx - xorl %ebx,%eax + xorl %ebx,%edi movl %edx,%esi - roll $5,%edx + addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 - andl %eax,%edi - xorl %ebx,%eax - movdqa 16(%r11),%xmm8 - xorl %ebx,%edi - addl %edx,%ecx - pxor %xmm10,%xmm5 - rorl $7,%ebp + xorl %eax,%ebp + movdqa -32(%r11),%xmm8 + roll $5,%edx addl %edi,%ecx - movdqa %xmm3,%xmm6 - addl 32(%rsp),%ebx + andl %ebp,%esi + pxor %xmm10,%xmm5 xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx roll $5,%ecx - paddd %xmm5,%xmm8 - andl %ebp,%esi - xorl %eax,%ebp + addl %esi,%ebx psrldq $4,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx + andl %edx,%edi + xorl %ebp,%edx pxor %xmm2,%xmm6 - rorl $7,%edx - addl %esi,%ebx + addl %ecx,%ebx + rorl $7,%ecx pxor %xmm4,%xmm10 - addl 36(%rsp),%eax - xorl %ebp,%edx + xorl %ebp,%edi movl %ebx,%esi - roll $5,%ebx + addl 36(%rsp),%eax pxor %xmm10,%xmm6 - andl %edx,%edi - xorl %ebp,%edx + xorl %edx,%ecx + roll $5,%ebx movdqa %xmm8,16(%rsp) - xorl %ebp,%edi - addl %ebx,%eax - movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 - rorl $7,%ecx addl %edi,%eax - addl 40(%rsp),%ebp + andl %ecx,%esi + movdqa %xmm6,%xmm9 xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi - roll $5,%eax - andl %ecx,%esi - xorl %edx,%ecx + addl 40(%rsp),%ebp psrld $31,%xmm10 - xorl %edx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm8 - rorl $7,%ebx + xorl %ecx,%ebx + roll $5,%eax addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax por %xmm10,%xmm6 - addl 44(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 - andl %ebx,%edi - xorl %ecx,%ebx - movdqa 16(%r11),%xmm9 - xorl %ecx,%edi - addl %ebp,%edx - pxor %xmm8,%xmm6 - rorl $7,%eax + xorl %ebx,%eax + movdqa -32(%r11),%xmm9 + roll $5,%ebp addl %edi,%edx - movdqa %xmm4,%xmm7 - addl 48(%rsp),%ecx + andl %eax,%esi + pxor %xmm8,%xmm6 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp roll $5,%edx - paddd %xmm6,%xmm9 - andl %eax,%esi - xorl %ebx,%eax + addl %esi,%ecx psrldq $4,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx + andl %ebp,%edi + xorl %eax,%ebp pxor %xmm3,%xmm7 - rorl $7,%ebp - addl %esi,%ecx + addl %edx,%ecx + rorl $7,%edx pxor %xmm5,%xmm8 - addl 52(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 52(%rsp),%ebx pxor %xmm8,%xmm7 - andl %ebp,%edi - xorl %eax,%ebp + xorl %ebp,%edx + roll $5,%ecx movdqa %xmm9,32(%rsp) - xorl %eax,%edi - addl %ecx,%ebx - movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 - rorl $7,%edx addl %edi,%ebx - addl 56(%rsp),%eax + andl %edx,%esi + movdqa %xmm7,%xmm10 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi - roll $5,%ebx - andl %edx,%esi - xorl %ebp,%edx + addl 56(%rsp),%eax psrld $31,%xmm8 - xorl %ebp,%esi - addl %ebx,%eax - movdqa %xmm10,%xmm9 - rorl $7,%ecx + xorl %edx,%ecx + roll $5,%ebx addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx por 
%xmm8,%xmm7 - addl 60(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 - andl %ecx,%edi - xorl %edx,%ecx - movdqa 16(%r11),%xmm10 - xorl %edx,%edi - addl %eax,%ebp - pxor %xmm9,%xmm7 - rorl $7,%ebx + xorl %ecx,%ebx + movdqa -32(%r11),%xmm10 + roll $5,%eax addl %edi,%ebp - movdqa %xmm7,%xmm9 - addl 0(%rsp),%edx - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax roll $5,%ebp pxor %xmm1,%xmm0 - andl %ebx,%esi - xorl %ecx,%ebx + addl %esi,%edx + andl %eax,%edi movdqa %xmm10,%xmm8 + xorl %ebx,%eax paddd %xmm7,%xmm10 - xorl %ecx,%esi addl %ebp,%edx pxor %xmm9,%xmm0 - rorl $7,%eax - addl %esi,%edx + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi addl 4(%rsp),%ecx - xorl %ebx,%eax movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) - movl %edx,%esi + xorl %eax,%ebp roll $5,%edx - andl %eax,%edi - xorl %ebx,%eax + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp pslld $2,%xmm0 - xorl %ebx,%edi addl %edx,%ecx + rorl $7,%edx psrld $30,%xmm9 - rorl $7,%ebp - addl %edi,%ecx - addl 8(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%esi movl %ecx,%edi - roll $5,%ecx + addl 8(%rsp),%ebx por %xmm9,%xmm0 - andl %ebp,%esi - xorl %eax,%ebp - movdqa %xmm0,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - addl 12(%rsp),%eax xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax xorl %ebp,%edi - addl %ebx,%eax - rorl $7,%ecx + movl %ebx,%esi + roll $5,%ebx addl %edi,%eax - addl 16(%rsp),%ebp - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm0,%xmm8 + addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm8,0(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 + addl %ebp,%edx addl 24(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm10 + pslld $2,%xmm1 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx - xorl %eax,%edi - movdqa %xmm1,%xmm8 + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 32(%rsp),%eax - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 - xorl %edx,%esi - addl %ebx,%eax - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r11),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 + addl %edi,%ebp + 
xorl %ecx,%esi movdqa %xmm9,16(%rsp) - xorl %ecx,%edi - addl %eax,%ebp rorl $7,%ebx - addl %edi,%ebp - pslld $2,%xmm2 + addl %eax,%ebp addl 40(%rsp),%edx - xorl %ecx,%esi - psrld $30,%xmm8 + pslld $2,%xmm2 + xorl %ebx,%esi movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx - xorl %ebx,%edi - movdqa %xmm2,%xmm9 + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 48(%rsp),%ebx - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 - xorl %ebp,%esi - addl %ecx,%ebx + addl %esi,%ebx + xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx - addl %esi,%ebx + paddd %xmm2,%xmm10 + addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax - xorl %ebp,%edi + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi movdqa %xmm10,32(%rsp) - xorl %edx,%edi - addl %ebx,%eax rorl $7,%ecx - addl %edi,%eax - pslld $2,%xmm3 + addl %ebx,%eax addl 56(%rsp),%ebp - xorl %edx,%esi - psrld $30,%xmm9 + pslld $2,%xmm3 + xorl %ecx,%esi movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx - xorl %ecx,%edi - movdqa %xmm3,%xmm10 + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 0(%rsp),%ecx - pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 - xorl %eax,%esi - addl %edx,%ecx + addl %esi,%ecx + xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp - addl %esi,%ecx + paddd %xmm3,%xmm8 + addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi movdqa %xmm8,48(%rsp) - xorl %ebp,%edi - addl %ecx,%ebx rorl $7,%edx - addl %edi,%ebx - pslld $2,%xmm4 + addl %ecx,%ebx addl 8(%rsp),%eax - xorl %ebp,%esi - psrld $30,%xmm10 + pslld $2,%xmm4 + xorl %edx,%esi movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp - xorl %edx,%edi - movdqa %xmm4,%xmm8 + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 16(%rsp),%edx - pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 - xorl %ebx,%esi - addl %ebp,%edx + addl %esi,%edx + xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax - addl %esi,%edx + paddd %xmm4,%xmm9 + addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx - xorl %ebx,%edi + xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi movdqa %xmm9,0(%rsp) - xorl %eax,%edi - addl %edx,%ecx rorl $7,%ebp - addl %edi,%ecx - pslld $2,%xmm5 
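The `.byte 102,68,15,58,15,...` sequences being removed in these hunks are hand-assembled `palignr` instructions (opcode 66 0F 3A 0F, with a 0x44 REX prefix where %xmm8-%xmm15 are involved), emitted as raw bytes so that older assemblers without SSSE3 support can still build the file; the `.byte 102,15,56,0,...` lines that remain are `pshufb` in the same style. The update replaces each `palignr $8` with a `pshufd $238` / `punpcklqdq` pair that selects the same lanes, reportedly a win on microarchitectures where `palignr` is slow. A sketch of the two equivalent forms with SSE intrinsics, assuming a = {w0,w1,w2,w3} and b = {w4,w5,w6,w7}:

	#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

	/* Old form: one palignr builds {w2, w3, w4, w5}. */
	static __m128i shift_by_two_words_old(__m128i a, __m128i b)
	{
	    return _mm_alignr_epi8(b, a, 8);       /* palignr $8 */
	}

	/* New form: pshufd $238 (0xEE selects lanes {2,3,2,3} of a),
	 * then punpcklqdq merges in the low qword of b. */
	static __m128i shift_by_two_words_new(__m128i a, __m128i b)
	{
	    __m128i hi = _mm_shuffle_epi32(a, 0xEE); /* pshufd $238 */
	    return _mm_unpacklo_epi64(hi, b);        /* punpcklqdq */
	}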
+ addl %edx,%ecx addl 24(%rsp),%ebx - xorl %eax,%esi - psrld $30,%xmm8 + pslld $2,%xmm5 + xorl %ebp,%esi movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - xorl %ebp,%edi - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx movl %ebx,%esi - roll $5,%ebx xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + roll $5,%ebx addl %edi,%eax - movl %ecx,%edi - pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + xorl %ecx,%esi xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 addl 32(%rsp),%ebp - andl %edx,%edi - pxor %xmm7,%xmm6 andl %ecx,%esi + xorl %edx,%ecx rorl $7,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 - addl %edi,%ebp + punpcklqdq %xmm5,%xmm9 movl %eax,%edi - pxor %xmm9,%xmm6 + xorl %ecx,%esi + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movdqa %xmm6,%xmm9 - movdqa %xmm10,16(%rsp) - movl %ebx,%esi + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp addl 36(%rsp),%edx - andl %ecx,%esi - pslld $2,%xmm6 andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - psrld $30,%xmm9 - addl %esi,%edx + movdqa %xmm6,%xmm9 movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - por %xmm9,%xmm6 - movl %eax,%edi + xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax - movdqa %xmm6,%xmm10 + addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx - andl %ebx,%edi andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - addl %edi,%ecx movl %edx,%edi + xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%esi + xorl %ebp,%edi xorl %eax,%ebp + addl %edx,%ecx addl 44(%rsp),%ebx - andl %eax,%esi andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - addl %esi,%ebx movl %ecx,%esi + xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp + xorl %edx,%esi + xorl %ebp,%edx addl %ecx,%ebx - movl %edx,%edi pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 - xorl %ebp,%edx addl 48(%rsp),%eax - andl %ebp,%edi - pxor %xmm0,%xmm7 andl %edx,%esi + xorl %ebp,%edx rorl $7,%ecx - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 - addl %edi,%eax + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi - pxor %xmm10,%xmm7 + xorl %edx,%esi + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) - movl %ecx,%esi + movdqa 32(%r11),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax addl 52(%rsp),%ebp - andl %edx,%esi - pslld $2,%xmm7 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - psrld $30,%xmm10 - addl %esi,%ebp + movdqa %xmm7,%xmm10 movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - por %xmm10,%xmm7 - movl %ebx,%edi + xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx - movdqa %xmm7,%xmm8 + addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx - andl %ecx,%edi andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - addl %edi,%edx movl %ebp,%edi + xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%esi + xorl %eax,%edi xorl %ebx,%eax + addl %ebp,%edx addl 60(%rsp),%ecx - andl %ebx,%esi andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - addl %esi,%ecx movl %edx,%esi + xorl %eax,%edi roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax + xorl %ebp,%esi + xorl %eax,%ebp 
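SSE2 has no per-lane rotate instruction, so the vectorized schedule synthesizes ROTL over four 32-bit lanes with two shifts and a combine: the `pslld`/`psrld` pairs above, merged with `por` (or `pxor`; the shifted halves are bit-disjoint, so either works). For a rotate by 1 the code doubles the lanes with `paddd %xmmN,%xmmN` instead of `pslld $1`, an equivalent operation that schedules better on some cores. A small intrinsics sketch of both cases used here:

	#include <emmintrin.h>   /* SSE2 */

	/* ROTL by 2 on four 32-bit lanes: the pslld $2 / psrld $30
	 * pair in the listing. */
	static __m128i rotl2_x4(__m128i x)
	{
	    return _mm_or_si128(_mm_slli_epi32(x, 2),
	                        _mm_srli_epi32(x, 30));
	}

	/* ROTL by 1, as the listing spells it: paddd x,x doubles
	 * each lane, standing in for a left shift by 1. */
	static __m128i rotl1_x4(__m128i x)
	{
	    return _mm_or_si128(_mm_add_epi32(x, x),
	                        _mm_srli_epi32(x, 31));
	}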
addl %edx,%ecx - movl %ebp,%edi pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 - xorl %eax,%ebp addl 0(%rsp),%ebx - andl %eax,%edi - pxor %xmm1,%xmm0 andl %ebp,%esi + xorl %eax,%ebp rorl $7,%edx - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 - addl %edi,%ebx + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi - pxor %xmm8,%xmm0 + xorl %ebp,%esi + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) - movl %edx,%esi + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx addl 4(%rsp),%eax - andl %ebp,%esi - pslld $2,%xmm0 andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - psrld $30,%xmm8 - addl %esi,%eax + movdqa %xmm0,%xmm8 movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx - addl %ebx,%eax - por %xmm8,%xmm0 - movl %ecx,%edi + xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx - movdqa %xmm0,%xmm9 + addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp - andl %edx,%edi andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - addl %edi,%ebp movl %eax,%edi + xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%esi + xorl %ebx,%edi xorl %ecx,%ebx + addl %eax,%ebp addl 12(%rsp),%edx - andl %ecx,%esi andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - addl %esi,%edx movl %ebp,%esi + xorl %ebx,%edi roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx + xorl %eax,%esi + xorl %ebx,%eax addl %ebp,%edx - movl %eax,%edi pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 - xorl %ebx,%eax addl 16(%rsp),%ecx - andl %ebx,%edi - pxor %xmm2,%xmm1 andl %eax,%esi + xorl %ebx,%eax rorl $7,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 - addl %edi,%ecx + punpcklqdq %xmm0,%xmm9 movl %edx,%edi - pxor %xmm9,%xmm1 + xorl %eax,%esi + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) - movl %ebp,%esi + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx addl 20(%rsp),%ebx - andl %eax,%esi - pslld $2,%xmm1 andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - psrld $30,%xmm9 - addl %esi,%ebx + movdqa %xmm1,%xmm9 movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - por %xmm9,%xmm1 - movl %edx,%edi + xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx - movdqa %xmm1,%xmm10 + addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax - andl %ebp,%edi andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - addl %edi,%eax movl %ebx,%edi + xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movl %ecx,%esi + xorl %ecx,%edi xorl %edx,%ecx + addl %ebx,%eax addl 28(%rsp),%ebp - andl %edx,%esi andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - addl %esi,%ebp movl %eax,%esi + xorl %ecx,%edi roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx + xorl %ebx,%esi + xorl %ecx,%ebx addl %eax,%ebp - movl %ebx,%edi pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 - xorl %ecx,%ebx addl 32(%rsp),%edx - andl %ecx,%edi - pxor %xmm3,%xmm2 andl %ebx,%esi + xorl %ecx,%ebx rorl $7,%eax - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 - addl %edi,%edx + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi - pxor %xmm10,%xmm2 + xorl %ebx,%esi + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) - movl %eax,%esi + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 xorl 
%ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx addl 36(%rsp),%ecx - andl %ebx,%esi - pslld $2,%xmm2 andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - psrld $30,%xmm10 - addl %esi,%ecx + movdqa %xmm2,%xmm10 movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - por %xmm10,%xmm2 - movl %ebp,%edi + xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp - movdqa %xmm2,%xmm8 + addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx - andl %eax,%edi andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - addl %edi,%ebx movl %ecx,%edi + xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%esi + xorl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx addl 44(%rsp),%eax - andl %ebp,%esi andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - addl %esi,%eax movl %ebx,%esi + xorl %edx,%edi roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx + xorl %edx,%esi addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 - xorl %edx,%esi + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm2,%xmm9 + addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm9,32(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 + addl %ebp,%edx addl 56(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm8 + pslld $2,%xmm3 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi movdqa %xmm10,48(%rsp) - addl %ebx,%eax rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax addl 4(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 8(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 12(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2278,113 +2205,112 @@ _ssse3_shortcut: .byte 102,15,56,0,198 addq $64,%r9 addl 16(%rsp),%ebx - xorl %eax,%esi -.byte 102,15,56,0,206 + xorl %ebp,%esi movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx paddd %xmm9,%xmm0 - xorl %ebp,%esi addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax - xorl %ebp,%edi - psubd %xmm9,%xmm0 + xorl %edx,%edi movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll 
$5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi -.byte 102,15,56,0,214 + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp paddd %xmm9,%xmm1 - xorl %eax,%esi addl %edx,%ecx - rorl $7,%ebp - addl %esi,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx - xorl %eax,%edi - psubd %xmm9,%xmm1 + xorl %ebp,%edi movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi -.byte 102,15,56,0,222 + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax paddd %xmm9,%xmm2 - xorl %ebx,%esi addl %ebp,%edx - rorl $7,%eax - addl %esi,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx - xorl %ebx,%edi - psubd %xmm9,%xmm2 + xorl %eax,%edi movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2394,108 +2320,110 @@ _ssse3_shortcut: movl %esi,4(%r8) movl %esi,%ebx movl %ecx,8(%r8) + movl %ecx,%edi movl %edx,12(%r8) + xorl %edx,%edi movl %ebp,16(%r8) + andl %edi,%esi jmp .Loop_ssse3 .align 16 .Ldone_ssse3: addl 16(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 20(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi roll $5,%edx - xorl 
%eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - addl 36(%rsp),%ebx xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 52(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2506,21 +2434,28 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq 64(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .align 64 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/lib/accelerated/x86/elf/sha256-ssse3-x86.s b/lib/accelerated/x86/elf/sha256-ssse3-x86.s index aab48fa387..211468c32a 100644 --- a/lib/accelerated/x86/elf/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha256-ssse3-x86.s @@ -64,195 +64,391 @@ sha256_block_data_order: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + jmp .L002loop .align 16 .L002loop: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx - movl 12(%edi),%edx bswap %eax + movl 12(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + 
bswap %edx pushl %ecx pushl %edx movl 16(%edi),%eax movl 20(%edi),%ebx movl 24(%edi),%ecx - movl 28(%edi),%edx bswap %eax + movl 28(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 32(%edi),%eax movl 36(%edi),%ebx movl 40(%edi),%ecx - movl 44(%edi),%edx bswap %eax + movl 44(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 48(%edi),%eax movl 52(%edi),%ebx movl 56(%edi),%ecx - movl 60(%edi),%edx bswap %eax + movl 60(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx addl $64,%edi - subl $32,%esp - movl %edi,100(%esp) + leal -36(%esp),%esp + movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl %edi,12(%esp) + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edi,28(%esp) + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) .align 16 .L00300_15: - movl 92(%esp),%ebx movl %edx,%ecx + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi + movl 28(%esp),%edi xorl %edx,%ecx - rorl $5,%ecx - xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl 96(%esp),%ebx + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx - movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx - movl %eax,(%esp) - movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - andl %esi,%ecx - andl %edi,%eax - movl (%ebp),%esi - orl %ecx,%eax + xorl %edi,%eax addl $4,%ebp addl %ebx,%eax - addl %esi,%edx - addl %esi,%eax cmpl $3248222580,%esi jne .L00300_15 - movl 152(%esp),%ebx + movl 156(%esp),%ecx + jmp .L00416_63 .align 16 .L00416_63: - movl %ebx,%esi - movl 100(%esp),%ecx - rorl $11,%esi - movl %ecx,%edi - xorl %ebx,%esi - rorl $7,%esi + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx shrl $3,%ebx - rorl $2,%edi - xorl %esi,%ebx - xorl %ecx,%edi - rorl $17,%edi - shrl $10,%ecx - addl 156(%esp),%ebx - xorl %ecx,%edi - addl 120(%esp),%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx movl %edx,%ecx - addl %edi,%ebx + xorl %esi,%edi + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi - xorl %edx,%ecx - rorl $5,%ecx - movl %ebx,92(%esp) + addl %edi,%ebx + movl 28(%esp),%edi xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl %ebx,96(%esp) + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx 
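For orientation in the SHA-256 code: the chained `rorl`s implement the Sigma and sigma functions of FIPS 180-4. Rotations compose, so rotating a copy by 14, XORing in the original, rotating by 5, XORing again and rotating by 6 yields ROTR6(x) ^ ROTR11(x) ^ ROTR25(x), i.e. Sigma1, with only one live temporary; the `shrl $3` / `shrl $10` variants build the small sigmas the same way. Note also the loop control: instead of a round counter, the code compares the constant it just fetched from the K256 table against the last constant of the range (`cmpl $3248222580,%esi` is K[15] = 0xc19bf174, `cmpl $3329325298,%esi` is K[63] = 0xc67178f2). The `.long 66051,67438087,...` row appended after the flattened K256 table is the byte-swap mask 0x00010203, 0x04050607, ..., presumably for the vector paths added later in this file. A C sketch of the schedule functions:

	#include <stdint.h>

	static uint32_t rotr32(uint32_t x, unsigned n)
	{
	    return (x >> n) | (x << (32 - n));
	}

	/* SHA-256 message-schedule sigmas (FIPS 180-4). */
	static uint32_t sig0(uint32_t x)   /* the rorl $11 -> $7 chain + shrl $3 */
	{
	    return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
	}

	static uint32_t sig1(uint32_t x)   /* the rorl $2 -> $17 chain + shrl $10 */
	{
	    return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
	}

	/* One expanded message word:
	 * W[t] = sig1(W[t-2]) + W[t-7] + sig0(W[t-15]) + W[t-16]. */
	static uint32_t sha256_next_w(const uint32_t w[], unsigned t)
	{
	    return sig1(w[t - 2]) + w[t - 7] + sig0(w[t - 15]) + w[t - 16];
	}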
- movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne .L00416_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb .L002loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005loop_shrd: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 16 +.L00600_15_shrd: + movl %edx,%ecx + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne .L00600_15_shrd + movl 156(%esp),%ecx + jmp .L00716_63_shrd +.align 16 +.L00716_63_shrd: + movl %ecx,%ebx + movl 104(%esp),%esi + shrdl $11,%ecx,%ecx + movl %esi,%edi + shrdl $2,%esi,%esi + xorl %ebx,%ecx + shrl $3,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + shrdl $17,%esi,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - 
andl %esi,%ecx - andl %edi,%eax + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx movl (%ebp),%esi - orl %ecx,%eax + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx addl $4,%ebp addl %ebx,%eax - movl 152(%esp),%ebx - addl %esi,%edx - addl %esi,%eax cmpl $3329325298,%esi - jne .L00416_63 - movl 352(%esp),%esi - movl 4(%esp),%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edi + jne .L00716_63_shrd + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx - addl 8(%esi),%ecx - addl 12(%esi),%edi + addl 8(%esi),%edi + addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) - movl %ecx,8(%esi) - movl %edi,12(%esi) - movl 20(%esp),%eax - movl 24(%esp),%ebx - movl 28(%esp),%ecx - movl 356(%esp),%edi + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi addl 16(%esi),%edx addl 20(%esi),%eax addl 24(%esi),%ebx @@ -261,10 +457,10 @@ sha256_block_data_order: movl %eax,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) - addl $352,%esp + leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb .L002loop + jb .L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -273,27 +469,2920 @@ sha256_block_data_order: ret .align 64 .L001K256: -.long 1116352408,1899447441,3049323471,3921009573 -.long 961987163,1508970993,2453635748,2870763221 -.long 3624381080,310598401,607225278,1426881987 -.long 1925078388,2162078206,2614888103,3248222580 -.long 3835390401,4022224774,264347078,604807628 -.long 770255983,1249150122,1555081692,1996064986 -.long 2554220882,2821834349,2952996808,3210313671 -.long 3336571891,3584528711,113926993,338241895 -.long 666307205,773529912,1294757372,1396182291 -.long 1695183700,1986661051,2177026350,2456956037 -.long 2730485921,2820302411,3259730800,3345764771 -.long 3516065817,3600352804,4094571909,275423344 -.long 430227734,506948616,659060556,883997877 -.long 958139571,1322822218,1537002063,1747873779 -.long 1955562222,2024104815,2227730452,2361852424 -.long 2428436474,2756734187,3204031479,3329325298 -.size sha256_block_data_order,.-.L_sha256_block_data_order_begin +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 +.align 16 +.L008unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl 
%ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp .L009grand_loop +.align 16 +.L009grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 
28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl 
%ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + 
movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl 
%eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl 
$2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi 
+ movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx 
+ rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + 
movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + 
xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl 
%ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl 
%ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 
2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl 
%edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb .L009grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha256_block_data_order,.-.L_sha256_block_data_order_begin .section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86.s b/lib/accelerated/x86/elf/sha512-ssse3-x86.s index e8eeefe7ad..7fa849a267 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86.s @@ -594,6 +594,8 @@ sha512_block_data_order: .long 4234509866,1501505948 .long 987167468,1607167915 .long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 .size sha512_block_data_order,.-.L_sha512_block_data_order_begin .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 diff --git a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s index 22f55fe953..f85e0bb6d8 100644 --- a/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/elf/sha512-ssse3-x86_64.s @@ -39,10 +39,17 @@ # .text + .globl sha256_block_data_order .type sha256_block_data_order,@function .align 16 sha256_block_data_order: + leaq 
_gnutls_x86_cpuid_s(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $512,%r10d + jnz .Lssse3_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -60,8 +67,6 @@ sha256_block_data_order: movq %r11,64+24(%rsp) .Lprologue: - leaq K256(%rip),%rbp - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -74,1694 +79,1632 @@ sha256_block_data_order: .align 16 .Lloop: - xorq %rdi,%rdi + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl 
(%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl 
%ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp + addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl 
%r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movl 4(%rsp),%r13d - movl 
56(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 56(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 36(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 8(%rsp),%r13d - movl 60(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 60(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 40(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 12(%rsp),%r13d - movl 0(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 0(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 44(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d 
+ + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 16(%rsp),%r13d - movl 4(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 4(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 48(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 20(%rsp),%r13d - movl 8(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 8(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 52(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 24(%rsp),%r13d - movl 12(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 12(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 56(%rsp),%r12d - - rorl $2,%r15d - xorl 
%r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 28(%rsp),%r13d - movl 16(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 16(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 60(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 32(%rsp),%r13d - movl 20(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 20(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 0(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - 
movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp movl 36(%rsp),%r13d - movl 24(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 24(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 4(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 40(%rsp),%r13d - movl 28(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 28(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 8(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 44(%rsp),%r13d - movl 32(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 32(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 12(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl 
%ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 48(%rsp),%r13d - movl 36(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 36(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 16(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 52(%rsp),%r13d - movl 40(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 40(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 20(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 
56(%rsp),%r13d - movl 44(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 44(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 24(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 60(%rsp),%r13d - movl 48(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 48(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 28(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 0(%rsp),%r13d - movl 52(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 52(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 32(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl 
%r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax - cmpq $64,%rdi - jb .Lrounds_16_xx + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi + addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax @@ -1800,20 +1743,1139 @@ sha256_block_data_order: .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.Lssse3_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + 
+ jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 +.byte 102,15,56,0,199 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 +.byte 102,15,56,0,215 + movdqa 32(%rbp),%xmm5 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 
102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + 
addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 
+ addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d 
+ rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl 
%r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 64+24(%rsp),%rsi + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86.s b/lib/accelerated/x86/macosx/aes-ssse3-x86.s index 33f33fd4f3..3c12cbf20e 100644 --- a/lib/accelerated/x86/macosx/aes-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86.s @@ -82,33 +82,33 @@ __vpaes_encrypt_core: movdqa %xmm6,%xmm1 movdqa (%ebp),%xmm2 pandn %xmm0,%xmm1 - movdqu (%edx),%xmm5 - psrld $4,%xmm1 pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 .byte 102,15,56,0,208 movdqa 16(%ebp),%xmm0 -.byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 + psrld $4,%xmm1 addl $16,%edx +.byte 102,15,56,0,193 leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 jmp L000enc_entry .align 4,0x90 L001enc_loop: movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa 64(%ebp),%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 movdqa 80(%ebp),%xmm2 -.byte 102,15,56,0,211 - pxor %xmm5,%xmm2 movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addl $16,%edx pxor %xmm2,%xmm0 @@ -117,28 +117,28 @@ L001enc_loop: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andl $48,%ecx - pxor %xmm3,%xmm0 subl $1,%eax + pxor %xmm3,%xmm0 L000enc_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 - movdqu (%edx),%xmm5 + pxor %xmm0,%xmm2 .byte 
102,15,56,0,220 + movdqu (%edx),%xmm5 pxor %xmm1,%xmm3 jnz L001enc_loop movdqa 96(%ebp),%xmm4 @@ -152,8 +152,8 @@ L000enc_entry: ret .align 4 __vpaes_decrypt_core: - movl 240(%edx),%eax leal 608(%ebp),%ebx + movl 240(%edx),%eax movdqa %xmm6,%xmm1 movdqa -64(%ebx),%xmm2 pandn %xmm0,%xmm1 @@ -176,56 +176,56 @@ __vpaes_decrypt_core: .align 4,0x90 L003dec_loop: movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addl $16,%edx -.byte 102,15,56,0,197 movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subl $1,%eax .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%ebx),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%ebx),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + addl $16,%edx .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax L002dec_entry: movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 pandn %xmm0,%xmm1 - psrld $4,%xmm1 pand %xmm6,%xmm0 - movdqa -32(%ebp),%xmm2 + psrld $4,%xmm1 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm7,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 jnz L003dec_loop movdqa 96(%ebx),%xmm4 .byte 102,15,56,0,226 @@ -330,12 +330,12 @@ L013schedule_mangle_last_dec: ret .align 4 __vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 ret .align 4 @@ -588,6 +588,8 @@ L_vpaes_cbc_encrypt_begin: movl 24(%esp),%edi movl 28(%esp),%eax movl 32(%esp),%edx + subl $16,%eax + jc L020cbc_abort leal -56(%esp),%ebx movl 36(%esp),%ebp andl $-16,%ebx @@ -597,18 +599,17 @@ L_vpaes_cbc_encrypt_begin: subl %esi,%edi movl %ebx,48(%esp) movl %edi,(%esp) - subl $16,%eax movl %edx,4(%esp) movl %ebp,8(%esp) movl %eax,%edi - leal L_vpaes_consts+0x30-L020pic_point,%ebp + leal L_vpaes_consts+0x30-L021pic_point,%ebp call __vpaes_preheat -L020pic_point: +L021pic_point: cmpl $0,%ecx - je L021cbc_dec_loop - jmp L022cbc_enc_loop + je L022cbc_dec_loop + jmp L023cbc_enc_loop .align 4,0x90 -L022cbc_enc_loop: +L023cbc_enc_loop: movdqu (%esi),%xmm0 pxor %xmm1,%xmm0 call __vpaes_encrypt_core @@ -618,10 +619,10 @@ L022cbc_enc_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L022cbc_enc_loop - jmp L023cbc_done + jnc L023cbc_enc_loop + jmp L024cbc_done .align 4,0x90 -L021cbc_dec_loop: +L022cbc_dec_loop: movdqu (%esi),%xmm0 movdqa %xmm1,16(%esp) movdqa %xmm0,32(%esp) @@ -633,11 +634,12 @@ L021cbc_dec_loop: movdqu %xmm0,(%ebx,%esi,1) leal 16(%esi),%esi subl $16,%edi - jnc L021cbc_dec_loop -L023cbc_done: + jnc L022cbc_dec_loop +L024cbc_done: movl 8(%esp),%ebx movl 48(%esp),%esp movdqu %xmm1,(%ebx) +L020cbc_abort: popl %edi popl %esi popl 
%ebx diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s index 1fff24b967..ecb0b770a4 100644 --- a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s @@ -43,8 +43,8 @@ _vpaes_encrypt_core: movdqa L$k_ipt+16(%rip),%xmm0 .byte 102,15,56,0,193 pxor %xmm5,%xmm2 - pxor %xmm2,%xmm0 addq $16,%r9 + pxor %xmm2,%xmm0 leaq L$k_mc_backward(%rip),%r10 jmp L$enc_entry @@ -52,19 +52,19 @@ _vpaes_encrypt_core: L$enc_loop: movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 .byte 102,15,56,0,195 - pxor %xmm4,%xmm0 + pxor %xmm5,%xmm4 movdqa %xmm15,%xmm5 -.byte 102,15,56,0,234 + pxor %xmm4,%xmm0 movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 movdqa %xmm14,%xmm2 .byte 102,15,56,0,211 - pxor %xmm5,%xmm2 - movdqa (%r11,%r10,1),%xmm4 movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 .byte 102,15,56,0,193 addq $16,%r9 pxor %xmm2,%xmm0 @@ -73,30 +73,30 @@ L$enc_loop: pxor %xmm0,%xmm3 .byte 102,15,56,0,193 andq $48,%r11 - pxor %xmm3,%xmm0 subq $1,%rax + pxor %xmm3,%xmm0 L$enc_entry: movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 pandn %xmm0,%xmm1 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm5 .byte 102,15,56,0,232 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm5,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 .byte 102,15,56,0,224 - pxor %xmm5,%xmm4 movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 - movdqu (%r9),%xmm5 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 + movdqu (%r9),%xmm5 pxor %xmm1,%xmm3 jnz L$enc_loop @@ -149,62 +149,61 @@ L$dec_loop: movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa -16(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - addq $16,%r9 - -.byte 102,15,56,0,197 movdqa 0(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 16(%r10),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - subq $1,%rax + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 -.byte 102,15,56,0,197 - movdqa 32(%r10),%xmm4 .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 48(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 +.byte 102,15,56,0,226 .byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + .byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 80(%r10),%xmm0 -.byte 102,15,56,0,195 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 pxor %xmm4,%xmm0 - + addq $16,%r9 .byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax L$dec_entry: movdqa %xmm9,%xmm1 pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 psrld $4,%xmm1 pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 .byte 102,15,56,0,208 - pxor %xmm1,%xmm0 movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 .byte 102,15,56,0,217 - pxor %xmm2,%xmm3 movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 .byte 102,15,56,0,224 pxor %xmm2,%xmm4 movdqa %xmm10,%xmm2 .byte 102,15,56,0,211 - pxor %xmm0,%xmm2 movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 .byte 102,15,56,0,220 - pxor %xmm1,%xmm3 movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 jnz L$dec_loop @@ -212,7 +211,7 @@ L$dec_entry: .byte 102,15,56,0,226 pxor %xmm0,%xmm4 movdqa 112(%r10),%xmm0 - movdqa L$k_sr-L$k_dsbd(%r11),%xmm2 + movdqa -352(%r11),%xmm2 .byte 102,15,56,0,195 pxor %xmm4,%xmm0 .byte 102,15,56,0,194 @@ -232,7 +231,7 @@ _vpaes_schedule_core: - call 
_vpaes_preheat + call _vpaes_preheat movdqa L$k_rcon(%rip),%xmm8 movdqu (%rdi),%xmm0 @@ -278,7 +277,7 @@ L$oop_schedule_128: call _vpaes_schedule_round decq %rsi jz L$schedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle jmp L$oop_schedule_128 @@ -299,7 +298,7 @@ L$oop_schedule_128: .p2align 4 L$schedule_192: movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqa %xmm0,%xmm6 pxor %xmm4,%xmm4 movhlps %xmm4,%xmm6 @@ -308,13 +307,13 @@ L$schedule_192: L$oop_schedule_192: call _vpaes_schedule_round .byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_round decq %rsi jz L$schedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle call _vpaes_schedule_192_smear jmp L$oop_schedule_192 @@ -331,18 +330,18 @@ L$oop_schedule_192: .p2align 4 L$schedule_256: movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movl $7,%esi L$oop_schedule_256: - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle movdqa %xmm0,%xmm6 call _vpaes_schedule_round decq %rsi jz L$schedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle pshufd $255,%xmm0,%xmm0 @@ -380,7 +379,7 @@ L$schedule_mangle_last: L$schedule_mangle_last_dec: addq $-16,%rdx pxor L$k_s63(%rip),%xmm0 - call _vpaes_schedule_transform + call _vpaes_schedule_transform movdqu %xmm0,(%rdx) @@ -412,12 +411,12 @@ L$schedule_mangle_last_dec: .p2align 4 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm0 - pxor %xmm0,%xmm6 + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 movdqa %xmm6,%xmm0 - pxor %xmm1,%xmm1 movhlps %xmm1,%xmm6 .byte 0xf3,0xc3 @@ -680,9 +679,10 @@ _vpaes_decrypt: .p2align 4 _vpaes_cbc_encrypt: xchgq %rcx,%rdx + subq $16,%rcx + jc L$cbc_abort movdqu (%r8),%xmm6 subq %rdi,%rsi - subq $16,%rcx call _vpaes_preheat cmpl $0,%r9d je L$cbc_dec_loop @@ -711,6 +711,7 @@ L$cbc_dec_loop: jnc L$cbc_dec_loop L$cbc_done: movdqu %xmm6,(%r8) +L$cbc_abort: .byte 0xf3,0xc3 @@ -833,7 +834,7 @@ L$k_dsbe: L$k_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/aesni-x86.s b/lib/accelerated/x86/macosx/aesni-x86.s index 09ca1cbc5c..ef1a0c25a2 100644 --- a/lib/accelerated/x86/macosx/aesni-x86.s +++ b/lib/accelerated/x86/macosx/aesni-x86.s @@ -84,27 +84,78 @@ L001dec1_loop_2: movups %xmm2,(%eax) ret .align 4 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L002enc2_loop: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz 
L002enc2_loop +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + ret +.align 4 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L003dec2_loop: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%edx,%ecx,1),%xmm0 + jnz L003dec2_loop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + ret +.align 4 __aesni_encrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -L002enc3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L004enc3_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 - movups (%edx),%xmm0 - jnz L002enc3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L004enc3_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -115,25 +166,26 @@ L002enc3_loop: .align 4 __aesni_decrypt3: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 - movups (%edx),%xmm0 -L003dec3_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L005dec3_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 - movups (%edx),%xmm0 - jnz L003dec3_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L005dec3_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -145,27 +197,29 @@ L003dec3_loop: __aesni_encrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -L004enc4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L006enc4_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%edx),%xmm0 - jnz L004enc4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L006enc4_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -179,27 +233,29 @@ L004enc4_loop: __aesni_decrypt4: movups (%edx),%xmm0 movups 16(%edx),%xmm1 - shrl $1,%ecx - leal 32(%edx),%edx + shll $4,%ecx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 pxor %xmm0,%xmm4 pxor %xmm0,%xmm5 - movups (%edx),%xmm0 -L005dec4_loop: + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L007dec4_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 
32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%edx),%xmm0 - jnz L005dec4_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L007dec4_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -212,45 +268,44 @@ L005dec4_loop: .align 4 __aesni_encrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 + addl $16,%ecx +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 + movups -16(%edx,%ecx,1),%xmm0 jmp L_aesni_encrypt6_enter .align 4,0x90 -L006enc6_loop: +L008enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %ecx .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.align 4,0x90 L_aesni_encrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leal 32(%edx),%edx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%edx),%xmm0 - jnz L006enc6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L008enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -267,45 +322,44 @@ L_aesni_encrypt6_enter: .align 4 __aesni_decrypt6: movups (%edx),%xmm0 - shrl $1,%ecx + shll $4,%ecx movups 16(%edx),%xmm1 - leal 32(%edx),%edx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor %xmm0,%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 + addl $16,%ecx +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 + movups -16(%edx,%ecx,1),%xmm0 jmp L_aesni_decrypt6_enter .align 4,0x90 -L007dec6_loop: +L009dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %ecx .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.align 4,0x90 L_aesni_decrypt6_enter: - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leal 32(%edx),%edx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%edx),%xmm0 - jnz L007dec6_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L009dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -333,14 +387,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L008ecb_ret + jz L010ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L009ecb_decrypt + jz L011ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L010ecb_enc_tail + jb L012ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -349,9 +403,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L011ecb_enc_loop6_enter + jmp L013ecb_enc_loop6_enter .align 4,0x90 -L012ecb_enc_loop6: +L014ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -366,12 +420,12 @@ L012ecb_enc_loop6: leal 96(%edi),%edi movdqu 
80(%esi),%xmm7 leal 96(%esi),%esi -L011ecb_enc_loop6_enter: +L013ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L012ecb_enc_loop6 + jnc L014ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -380,18 +434,18 @@ L011ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L008ecb_ret -L010ecb_enc_tail: + jz L010ecb_ret +L012ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L013ecb_enc_one + jb L015ecb_enc_one movups 16(%esi),%xmm3 - je L014ecb_enc_two + je L016ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L015ecb_enc_three + jb L017ecb_enc_three movups 48(%esi),%xmm5 - je L016ecb_enc_four + je L018ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -400,50 +454,49 @@ L010ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L013ecb_enc_one: +L015ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L017enc1_loop_3: +L019enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L017enc1_loop_3 + jnz L019enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L014ecb_enc_two: - xorps %xmm4,%xmm4 - call __aesni_encrypt3 +L016ecb_enc_two: + call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L015ecb_enc_three: +L017ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L016ecb_enc_four: +L018ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L009ecb_decrypt: +L011ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L018ecb_dec_tail + jb L020ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -452,9 +505,9 @@ L009ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L019ecb_dec_loop6_enter + jmp L021ecb_dec_loop6_enter .align 4,0x90 -L020ecb_dec_loop6: +L022ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -469,12 +522,12 @@ L020ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L019ecb_dec_loop6_enter: +L021ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L020ecb_dec_loop6 + jnc L022ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -483,18 +536,18 @@ L019ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L008ecb_ret -L018ecb_dec_tail: + jz L010ecb_ret +L020ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L021ecb_dec_one + jb L023ecb_dec_one movups 16(%esi),%xmm3 - je L022ecb_dec_two + je L024ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L023ecb_dec_three + jb L025ecb_dec_three movups 48(%esi),%xmm5 - je L024ecb_dec_four + je L026ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -503,44 +556,43 @@ L018ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L021ecb_dec_one: +L023ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L025dec1_loop_4: +L027dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 
16(%edx),%edx - jnz L025dec1_loop_4 + jnz L027dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L022ecb_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +L024ecb_dec_two: + call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L023ecb_dec_three: +L025ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L008ecb_ret + jmp L010ecb_ret .align 4,0x90 -L024ecb_dec_four: +L026ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L008ecb_ret: +L010ecb_ret: popl %edi popl %esi popl %ebx @@ -577,45 +629,45 @@ L_aesni_ccm64_encrypt_blocks_begin: movl %ebp,20(%esp) movl %ebp,24(%esp) movl %ebp,28(%esp) - shrl $1,%ecx + shll $4,%ecx + movl $16,%ebx leal (%edx),%ebp movdqa (%esp),%xmm5 movdqa %xmm7,%xmm2 - movl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx .byte 102,15,56,0,253 -L026ccm64_enc_outer: +L028ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 xorps %xmm0,%xmm2 movups 16(%ebp),%xmm1 xorps %xmm6,%xmm0 - leal 32(%ebp),%edx xorps %xmm0,%xmm3 - movups (%edx),%xmm0 -L027ccm64_enc2_loop: + movups 32(%ebp),%xmm0 +L029ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz L027ccm64_enc2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L029ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 + decl %eax .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decl %eax leal 16(%esi),%esi xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) - leal 16(%edi),%edi .byte 102,15,56,0,213 - jnz L026ccm64_enc_outer + leal 16(%edi),%edi + jnz L028ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) @@ -664,67 +716,70 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L028enc1_loop_5: +L030enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L028enc1_loop_5 + jnz L030enc1_loop_5 .byte 102,15,56,221,209 + shll $4,%ebx + movl $16,%ecx movups (%esi),%xmm6 paddq 16(%esp),%xmm7 leal 16(%esi),%esi - jmp L029ccm64_dec_outer + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp L031ccm64_dec_outer .align 4,0x90 -L029ccm64_dec_outer: +L031ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 - movl %ebx,%ecx movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L030ccm64_dec_break + jz L032ccm64_dec_break movups (%ebp),%xmm0 - shrl $1,%ecx + movl %ebx,%ecx movups 16(%ebp),%xmm1 xorps %xmm0,%xmm6 - leal 32(%ebp),%edx xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 - movups (%edx),%xmm0 -L031ccm64_dec2_loop: + movups 32(%ebp),%xmm0 +L033ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %ecx .byte 102,15,56,220,217 - movups 16(%edx),%xmm1 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx .byte 102,15,56,220,208 - leal 32(%edx),%edx .byte 102,15,56,220,216 - movups (%edx),%xmm0 - jnz L031ccm64_dec2_loop + movups -16(%edx,%ecx,1),%xmm0 + jnz L033ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leal 16(%esi),%esi .byte 102,15,56,221,208 .byte 102,15,56,221,216 - jmp L029ccm64_dec_outer + leal 16(%esi),%esi + jmp L031ccm64_dec_outer .align 4,0x90 -L030ccm64_dec_break: 
+L032ccm64_dec_break: + movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 movups 16(%edx),%xmm1 xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L032enc1_loop_6: +L034enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L032enc1_loop_6 + jnz L034enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi @@ -752,7 +807,7 @@ L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L033ctr32_one_shortcut + je L035ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -768,63 +823,59 @@ L_aesni_ctr32_encrypt_blocks_begin: .byte 102,15,58,34,253,3 movl 240(%edx),%ecx bswap %ebx - pxor %xmm1,%xmm1 pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movdqa (%esp),%xmm2 -.byte 102,15,58,34,203,0 +.byte 102,15,58,34,195,0 leal 3(%ebx),%ebp -.byte 102,15,58,34,197,0 +.byte 102,15,58,34,205,0 incl %ebx -.byte 102,15,58,34,203,1 +.byte 102,15,58,34,195,1 incl %ebp -.byte 102,15,58,34,197,1 +.byte 102,15,58,34,205,1 incl %ebx -.byte 102,15,58,34,203,2 +.byte 102,15,58,34,195,2 incl %ebp -.byte 102,15,58,34,197,2 - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 - movdqa %xmm0,64(%esp) +.byte 102,15,58,34,205,2 + movdqa %xmm0,48(%esp) .byte 102,15,56,0,194 - pshufd $192,%xmm1,%xmm2 - pshufd $128,%xmm1,%xmm3 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L034ctr32_tail + jb L036ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx movdqa %xmm7,32(%esp) - shrl $1,%ecx movl %edx,%ebp - movl %ecx,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L035ctr32_loop6 + jmp L037ctr32_loop6 .align 4,0x90 -L035ctr32_loop6: - pshufd $64,%xmm1,%xmm4 - movdqa 32(%esp),%xmm1 - pshufd $192,%xmm0,%xmm5 - por %xmm1,%xmm2 - pshufd $128,%xmm0,%xmm6 - por %xmm1,%xmm3 - pshufd $64,%xmm0,%xmm7 - por %xmm1,%xmm4 - por %xmm1,%xmm5 - por %xmm1,%xmm6 - por %xmm1,%xmm7 - movups (%ebp),%xmm0 - movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx - decl %ecx +L037ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 +.byte 102,15,56,220,209 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 pxor %xmm0,%xmm7 +.byte 102,15,56,220,217 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call L_aesni_encrypt6_enter movups (%esi),%xmm1 @@ -835,51 +886,51 @@ L035ctr32_loop6: movups %xmm2,(%edi) movdqa 16(%esp),%xmm0 xorps %xmm1,%xmm4 - movdqa 48(%esp),%xmm1 + movdqa 64(%esp),%xmm1 movups %xmm3,16(%edi) movups %xmm4,32(%edi) paddd %xmm0,%xmm1 - paddd 64(%esp),%xmm0 + paddd 48(%esp),%xmm0 movdqa (%esp),%xmm2 movups 48(%esi),%xmm3 movups 64(%esi),%xmm4 xorps %xmm3,%xmm5 movups 80(%esi),%xmm3 leal 96(%esi),%esi - movdqa %xmm1,48(%esp) -.byte 102,15,56,0,202 + movdqa %xmm0,48(%esp) +.byte 102,15,56,0,194 xorps %xmm4,%xmm6 movups %xmm5,48(%edi) xorps %xmm3,%xmm7 - movdqa %xmm0,64(%esp) -.byte 102,15,56,0,194 + movdqa %xmm1,64(%esp) +.byte 102,15,56,0,202 movups %xmm6,64(%edi) - pshufd $192,%xmm1,%xmm2 + pshufd $192,%xmm0,%xmm2 movups %xmm7,80(%edi) leal 96(%edi),%edi - movl %ebx,%ecx - pshufd $128,%xmm1,%xmm3 + pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L035ctr32_loop6 + jnc 
L037ctr32_loop6 addl $6,%eax - jz L036ctr32_ret + jz L038ctr32_ret + movdqu (%ebp),%xmm7 movl %ebp,%edx - leal 1(,%ecx,2),%ecx - movdqa 32(%esp),%xmm7 -L034ctr32_tail: + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +L036ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L037ctr32_one - pshufd $64,%xmm1,%xmm4 + jb L039ctr32_one + pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L038ctr32_two - pshufd $192,%xmm0,%xmm5 + je L040ctr32_two + pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L039ctr32_three - pshufd $128,%xmm0,%xmm6 + jb L041ctr32_three + pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L040ctr32_four + je L042ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -897,39 +948,39 @@ L034ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L036ctr32_ret + jmp L038ctr32_ret .align 4,0x90 -L033ctr32_one_shortcut: +L035ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L037ctr32_one: +L039ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L041enc1_loop_7: +L043enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L041enc1_loop_7 + jnz L043enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L036ctr32_ret + jmp L038ctr32_ret .align 4,0x90 -L038ctr32_two: - call __aesni_encrypt3 +L040ctr32_two: + call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L036ctr32_ret + jmp L038ctr32_ret .align 4,0x90 -L039ctr32_three: +L041ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -940,9 +991,9 @@ L039ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L036ctr32_ret + jmp L038ctr32_ret .align 4,0x90 -L040ctr32_four: +L042ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -956,7 +1007,7 @@ L040ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L036ctr32_ret: +L038ctr32_ret: movl 80(%esp),%esp popl %edi popl %esi @@ -979,12 +1030,12 @@ L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L042enc1_loop_8: +L044enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L042enc1_loop_8 + jnz L044enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1008,12 +1059,14 @@ L042enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L043xts_enc_short - shrl $1,%ecx - movl %ecx,%ebx - jmp L044xts_enc_loop6 + jc L045xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L046xts_enc_loop6 .align 4,0x90 -L044xts_enc_loop6: +L046xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1049,6 +1102,7 @@ L044xts_enc_loop6: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1064,19 +1118,17 @@ L044xts_enc_loop6: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,220,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,220,217 +.byte 102,15,56,220,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,220,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%edx),%xmm0 .byte 102,15,56,220,249 call 
L_aesni_encrypt6_enter movdqa 80(%esp),%xmm1 @@ -1101,26 +1153,25 @@ L044xts_enc_loop6: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc L044xts_enc_loop6 - leal 1(,%ecx,2),%ecx + jnc L046xts_enc_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L043xts_enc_short: +L045xts_enc_short: addl $96,%eax - jz L045xts_enc_done6x + jz L047xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L046xts_enc_one + jb L048xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L047xts_enc_two + je L049xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1129,7 +1180,7 @@ L043xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L048xts_enc_three + jb L050xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1139,7 +1190,7 @@ L043xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L049xts_enc_four + je L051xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1171,9 +1222,9 @@ L043xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L050xts_enc_done + jmp L052xts_enc_done .align 4,0x90 -L046xts_enc_one: +L048xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1181,37 +1232,36 @@ L046xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L051enc1_loop_9: +L053enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L051enc1_loop_9 + jnz L053enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L050xts_enc_done + jmp L052xts_enc_done .align 4,0x90 -L047xts_enc_two: +L049xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - xorps %xmm4,%xmm4 - call __aesni_encrypt3 + call __aesni_encrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L050xts_enc_done + jmp L052xts_enc_done .align 4,0x90 -L048xts_enc_three: +L050xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1229,9 +1279,9 @@ L048xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L050xts_enc_done + jmp L052xts_enc_done .align 4,0x90 -L049xts_enc_four: +L051xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1253,28 +1303,28 @@ L049xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L050xts_enc_done + jmp L052xts_enc_done .align 4,0x90 -L045xts_enc_done6x: +L047xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L052xts_enc_ret + jz L054xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L053xts_enc_steal + jmp L055xts_enc_steal .align 4,0x90 -L050xts_enc_done: +L052xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L052xts_enc_ret + jz L054xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L053xts_enc_steal: +L055xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1282,7 +1332,7 @@ L053xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L053xts_enc_steal + jnz L055xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1292,16 +1342,16 @@ L053xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps 
%xmm0,%xmm2 -L054enc1_loop_10: +L056enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L054enc1_loop_10 + jnz L056enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L052xts_enc_ret: +L054xts_enc_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1324,12 +1374,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L055enc1_loop_11: +L057enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L055enc1_loop_11 + jnz L057enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1358,12 +1408,14 @@ L055enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L056xts_dec_short - shrl $1,%ecx - movl %ecx,%ebx - jmp L057xts_dec_loop6 + jc L058xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L059xts_dec_loop6 .align 4,0x90 -L057xts_dec_loop6: +L059xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1399,6 +1451,7 @@ L057xts_dec_loop6: pand %xmm3,%xmm7 movups (%esi),%xmm2 pxor %xmm1,%xmm7 + movl %ebx,%ecx movdqu 16(%esi),%xmm3 xorps %xmm0,%xmm2 movdqu 32(%esi),%xmm4 @@ -1414,19 +1467,17 @@ L057xts_dec_loop6: movdqa %xmm7,80(%esp) pxor %xmm1,%xmm7 movups 16(%ebp),%xmm1 - leal 32(%ebp),%edx pxor 16(%esp),%xmm3 -.byte 102,15,56,222,209 pxor 32(%esp),%xmm4 -.byte 102,15,56,222,217 +.byte 102,15,56,222,209 pxor 48(%esp),%xmm5 - decl %ecx -.byte 102,15,56,222,225 pxor 64(%esp),%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,217 pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%edx),%xmm0 .byte 102,15,56,222,249 call L_aesni_decrypt6_enter movdqa 80(%esp),%xmm1 @@ -1451,26 +1502,25 @@ L057xts_dec_loop6: paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 - movl %ebx,%ecx pxor %xmm2,%xmm1 subl $96,%eax - jnc L057xts_dec_loop6 - leal 1(,%ecx,2),%ecx + jnc L059xts_dec_loop6 + movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L056xts_dec_short: +L058xts_dec_short: addl $96,%eax - jz L058xts_dec_done6x + jz L060xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L059xts_dec_one + jb L061xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L060xts_dec_two + je L062xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1479,7 +1529,7 @@ L056xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L061xts_dec_three + jb L063xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1489,7 +1539,7 @@ L056xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L062xts_dec_four + je L064xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1521,9 +1571,9 @@ L056xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L063xts_dec_done + jmp L065xts_dec_done .align 4,0x90 -L059xts_dec_one: +L061xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1531,36 +1581,36 @@ L059xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L064dec1_loop_12: +L066dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L064dec1_loop_12 + jnz L066dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L063xts_dec_done + jmp L065xts_dec_done .align 4,0x90 
-L060xts_dec_two: +L062xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 leal 32(%esi),%esi xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 - call __aesni_decrypt3 + call __aesni_decrypt2 xorps %xmm5,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L063xts_dec_done + jmp L065xts_dec_done .align 4,0x90 -L061xts_dec_three: +L063xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1578,9 +1628,9 @@ L061xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L063xts_dec_done + jmp L065xts_dec_done .align 4,0x90 -L062xts_dec_four: +L064xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1602,20 +1652,20 @@ L062xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L063xts_dec_done + jmp L065xts_dec_done .align 4,0x90 -L058xts_dec_done6x: +L060xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L065xts_dec_ret + jz L067xts_dec_ret movl %eax,112(%esp) - jmp L066xts_dec_only_one_more + jmp L068xts_dec_only_one_more .align 4,0x90 -L063xts_dec_done: +L065xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L065xts_dec_ret + jz L067xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1625,7 +1675,7 @@ L063xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L066xts_dec_only_one_more: +L068xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1639,16 +1689,16 @@ L066xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L067dec1_loop_13: +L069dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L067dec1_loop_13 + jnz L069dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L068xts_dec_steal: +L070xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1656,7 +1706,7 @@ L068xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L068xts_dec_steal + jnz L070xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1666,16 +1716,16 @@ L068xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L069dec1_loop_14: +L071dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L069dec1_loop_14 + jnz L071dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L065xts_dec_ret: +L067xts_dec_ret: movl 116(%esp),%esp popl %edi popl %esi @@ -1699,7 +1749,7 @@ L_aesni_cbc_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz L070cbc_abort + jz L072cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1707,14 +1757,14 @@ L_aesni_cbc_encrypt_begin: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je L071cbc_decrypt + je L073cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb L072cbc_enc_tail + jb L074cbc_enc_tail subl $16,%eax - jmp L073cbc_enc_loop + jmp L075cbc_enc_loop .align 4,0x90 -L073cbc_enc_loop: +L075cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1722,24 +1772,24 @@ L073cbc_enc_loop: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -L074enc1_loop_15: +L076enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L074enc1_loop_15 + jnz L076enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc L073cbc_enc_loop + jnc L075cbc_enc_loop addl $16,%eax - jnz 
L072cbc_enc_tail + jnz L074cbc_enc_tail movaps %xmm2,%xmm7 - jmp L075cbc_ret -L072cbc_enc_tail: + jmp L077cbc_ret +L074cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1750,20 +1800,20 @@ L072cbc_enc_tail: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp L073cbc_enc_loop + jmp L075cbc_enc_loop .align 4,0x90 -L071cbc_decrypt: +L073cbc_decrypt: cmpl $80,%eax - jbe L076cbc_dec_tail + jbe L078cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp L077cbc_dec_loop6_enter + jmp L079cbc_dec_loop6_enter .align 4,0x90 -L078cbc_dec_loop6: +L080cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -L077cbc_dec_loop6_enter: +L079cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1793,28 +1843,28 @@ L077cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L078cbc_dec_loop6 + ja L080cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L079cbc_dec_tail_collected + jle L081cbc_dec_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L076cbc_dec_tail: +L078cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L080cbc_dec_one + jbe L082cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L081cbc_dec_two + jbe L083cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe L082cbc_dec_three + jbe L084cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L083cbc_dec_four + jbe L085cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1837,28 +1887,27 @@ L076cbc_dec_tail: leal 64(%edi),%edi movaps %xmm6,%xmm2 subl $80,%eax - jmp L079cbc_dec_tail_collected + jmp L081cbc_dec_tail_collected .align 4,0x90 -L080cbc_dec_one: +L082cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L084dec1_loop_16: +L086dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L084dec1_loop_16 + jnz L086dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L079cbc_dec_tail_collected + jmp L081cbc_dec_tail_collected .align 4,0x90 -L081cbc_dec_two: - xorps %xmm4,%xmm4 - call __aesni_decrypt3 +L083cbc_dec_two: + call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) @@ -1866,9 +1915,9 @@ L081cbc_dec_two: leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L079cbc_dec_tail_collected + jmp L081cbc_dec_tail_collected .align 4,0x90 -L082cbc_dec_three: +L084cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 @@ -1879,9 +1928,9 @@ L082cbc_dec_three: leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L079cbc_dec_tail_collected + jmp L081cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_four: +L085cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1896,23 +1945,23 @@ L083cbc_dec_four: leal 48(%edi),%edi movaps %xmm5,%xmm2 subl $64,%eax -L079cbc_dec_tail_collected: +L081cbc_dec_tail_collected: andl $15,%eax - jnz L085cbc_dec_tail_partial + jnz L087cbc_dec_tail_partial movups %xmm2,(%edi) - jmp L075cbc_ret + jmp L077cbc_ret .align 4,0x90 -L085cbc_dec_tail_partial: +L087cbc_dec_tail_partial: movaps %xmm2,(%esp) movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -L075cbc_ret: +L077cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp movups %xmm7,(%ebp) -L070cbc_abort: +L072cbc_abort: popl %edi popl %esi popl %ebx @@ -1921,51 +1970,51 @@ L070cbc_abort: .align 4 __aesni_set_encrypt_key: testl %eax,%eax - jz L086bad_pointer + jz L088bad_pointer testl %edx,%edx - 
jz L086bad_pointer + jz L088bad_pointer movups (%eax),%xmm0 xorps %xmm4,%xmm4 leal 16(%edx),%edx cmpl $256,%ecx - je L08714rounds + je L08914rounds cmpl $192,%ecx - je L08812rounds + je L09012rounds cmpl $128,%ecx - jne L089bad_keybits + jne L091bad_keybits .align 4,0x90 -L09010rounds: +L09210rounds: movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L091key_128_cold + call L093key_128_cold .byte 102,15,58,223,200,2 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,4 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,8 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,16 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,32 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,64 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,128 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,27 - call L092key_128 + call L094key_128 .byte 102,15,58,223,200,54 - call L092key_128 + call L094key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) xorl %eax,%eax ret .align 4,0x90 -L092key_128: +L094key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L091key_128_cold: +L093key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1974,38 +2023,38 @@ L091key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L08812rounds: +L09012rounds: movq 16(%eax),%xmm2 movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L093key_192a_cold + call L095key_192a_cold .byte 102,15,58,223,202,2 - call L094key_192b + call L096key_192b .byte 102,15,58,223,202,4 - call L095key_192a + call L097key_192a .byte 102,15,58,223,202,8 - call L094key_192b + call L096key_192b .byte 102,15,58,223,202,16 - call L095key_192a + call L097key_192a .byte 102,15,58,223,202,32 - call L094key_192b + call L096key_192b .byte 102,15,58,223,202,64 - call L095key_192a + call L097key_192a .byte 102,15,58,223,202,128 - call L094key_192b + call L096key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) xorl %eax,%eax ret .align 4,0x90 -L095key_192a: +L097key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L093key_192a_cold: +L095key_192a_cold: movaps %xmm2,%xmm5 -L096key_192b_warm: +L098key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2019,56 +2068,56 @@ L096key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L094key_192b: +L096key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L096key_192b_warm + jmp L098key_192b_warm .align 4,0x90 -L08714rounds: +L08914rounds: movups 16(%eax),%xmm2 movl $13,%ecx leal 16(%edx),%edx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L097key_256a_cold + call L099key_256a_cold .byte 102,15,58,223,200,1 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,2 - call L099key_256a + call L101key_256a .byte 102,15,58,223,200,2 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,4 - call L099key_256a + call L101key_256a .byte 102,15,58,223,200,4 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,8 - call L099key_256a + call L101key_256a .byte 102,15,58,223,200,8 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,16 - call L099key_256a + call L101key_256a .byte 102,15,58,223,200,16 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,32 - call L099key_256a + call L101key_256a .byte 102,15,58,223,200,32 - call L098key_256b + call L100key_256b .byte 102,15,58,223,202,64 - call L099key_256a + call 
L101key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax ret .align 4,0x90 -L099key_256a: +L101key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L097key_256a_cold: +L099key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2077,7 +2126,7 @@ L097key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L098key_256b: +L100key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2088,11 +2137,11 @@ L098key_256b: xorps %xmm1,%xmm2 ret .align 2,0x90 -L086bad_pointer: +L088bad_pointer: movl $-1,%eax ret .align 2,0x90 -L089bad_keybits: +L091bad_keybits: movl $-2,%eax ret .globl _aesni_set_encrypt_key @@ -2115,7 +2164,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L100dec_key_ret + jnz L102dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2123,7 +2172,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L101dec_key_inverse: +L103dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2133,12 +2182,12 @@ L101dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L101dec_key_inverse + ja L103dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) xorl %eax,%eax -L100dec_key_ret: +L102dec_key_ret: ret .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 diff --git a/lib/accelerated/x86/macosx/aesni-x86_64.s b/lib/accelerated/x86/macosx/aesni-x86_64.s index 7089c7c9d6..b58973fe6e 100644 --- a/lib/accelerated/x86/macosx/aesni-x86_64.s +++ b/lib/accelerated/x86/macosx/aesni-x86_64.s @@ -38,6 +38,7 @@ # *** This file is auto-generated *** # .text + .globl _aesni_encrypt .p2align 4 @@ -53,7 +54,7 @@ L$oop_enc1_1: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz L$oop_enc1_1 + jnz L$oop_enc1_1 .byte 102,15,56,221,209 movups %xmm2,(%rsi) .byte 0xf3,0xc3 @@ -74,34 +75,93 @@ L$oop_dec1_2: decl %eax movups (%rdx),%xmm1 leaq 16(%rdx),%rdx - jnz L$oop_dec1_2 + jnz L$oop_dec1_2 .byte 102,15,56,223,209 movups %xmm2,(%rsi) .byte 0xf3,0xc3 .p2align 4 +_aesni_encrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop2: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop2 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,221,208 +.byte 102,15,56,221,216 + .byte 0xf3,0xc3 + + +.p2align 4 +_aesni_decrypt2: + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$dec_loop2: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop2 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,223,208 +.byte 102,15,56,223,216 + .byte 0xf3,0xc3 + + +.p2align 4 _aesni_encrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax L$enc_loop3: .byte 
102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop3 .byte 102,15,56,220,209 @@ -116,25 +176,26 @@ L$enc_loop3: .p2align 4 _aesni_decrypt3: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax L$dec_loop3: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$dec_loop3 .byte 102,15,56,222,209 @@ -149,28 +210,30 @@ L$dec_loop3: .p2align 4 _aesni_encrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax L$enc_loop4: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop4 .byte 102,15,56,220,209 @@ -187,28 +250,30 @@ L$enc_loop4: .p2align 4 _aesni_decrypt4: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 xorps %xmm0,%xmm4 xorps %xmm0,%xmm5 - movups (%rcx),%xmm0 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax L$dec_loop4: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$dec_loop4 .byte 102,15,56,222,209 @@ -225,43 +290,43 @@ L$dec_loop4: .p2align 4 _aesni_encrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 +.byte 102,15,56,220,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 +.byte 102,15,56,220,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,220,233 .byte 102,15,56,220,241 - movups (%rcx),%xmm0 .byte 102,15,56,220,249 + movups -16(%rcx,%rax,1),%xmm0 jmp L$enc_loop6_enter .p2align 4 L$enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 L$enc_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 
102,15,56,220,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop6 .byte 102,15,56,220,209 @@ -282,43 +347,43 @@ L$enc_loop6_enter: .p2align 4 _aesni_decrypt6: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 pxor %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 +.byte 102,15,56,222,209 + leaq 32(%rcx,%rax,1),%rcx + negq %rax .byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 +.byte 102,15,56,222,225 pxor %xmm0,%xmm7 - decl %eax + addq $16,%rax +.byte 102,15,56,222,233 .byte 102,15,56,222,241 - movups (%rcx),%xmm0 .byte 102,15,56,222,249 + movups -16(%rcx,%rax,1),%xmm0 jmp L$dec_loop6_enter .p2align 4 L$dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 L$dec_loop6_enter: - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$dec_loop6 .byte 102,15,56,222,209 @@ -339,52 +404,51 @@ L$dec_loop6_enter: .p2align 4 _aesni_encrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,220,209 pxor %xmm0,%xmm4 -.byte 102,15,56,220,217 pxor %xmm0,%xmm5 -.byte 102,15,56,220,225 pxor %xmm0,%xmm6 -.byte 102,15,56,220,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,220,241 +.byte 102,15,56,220,217 pxor %xmm0,%xmm8 -.byte 102,15,56,220,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp L$enc_loop8_enter .p2align 4 L$enc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 .byte 102,68,15,56,220,193 .byte 102,68,15,56,220,201 - movups 16(%rcx),%xmm1 L$enc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 .byte 102,68,15,56,220,192 .byte 102,68,15,56,220,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$enc_loop8 .byte 102,15,56,220,209 @@ -409,52 +473,51 @@ L$enc_loop8_enter: .p2align 4 _aesni_decrypt8: movups (%rcx),%xmm0 - shrl $1,%eax + shll $4,%eax movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 xorps %xmm0,%xmm3 -.byte 102,15,56,222,209 pxor %xmm0,%xmm4 -.byte 102,15,56,222,217 pxor %xmm0,%xmm5 -.byte 102,15,56,222,225 pxor %xmm0,%xmm6 -.byte 102,15,56,222,233 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,222,209 + addq $16,%rax pxor %xmm0,%xmm7 - decl %eax -.byte 102,15,56,222,241 +.byte 102,15,56,222,217 pxor %xmm0,%xmm8 -.byte 102,15,56,222,249 pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 + movups -16(%rcx,%rax,1),%xmm0 jmp L$dec_loop8_enter 
.p2align 4 L$dec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 L$dec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 .byte 102,68,15,56,222,192 .byte 102,68,15,56,222,200 - movups (%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$dec_loop8 .byte 102,15,56,222,209 @@ -583,14 +646,13 @@ L$oop_enc1_3: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_3 + jnz L$oop_enc1_3 .byte 102,15,56,221,209 movups %xmm2,(%rsi) jmp L$ecb_ret .p2align 4 L$ecb_enc_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 + call _aesni_encrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp L$ecb_ret @@ -728,14 +790,13 @@ L$oop_dec1_4: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_4 + jnz L$oop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) jmp L$ecb_ret .p2align 4 L$ecb_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 + call _aesni_decrypt2 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) jmp L$ecb_ret @@ -782,53 +843,53 @@ L$ecb_ret: .p2align 4 _aesni_ccm64_encrypt_blocks: movl 240(%rcx),%eax - movdqu (%r8),%xmm9 - movdqa L$increment64(%rip),%xmm6 + movdqu (%r8),%xmm6 + movdqa L$increment64(%rip),%xmm9 movdqa L$bswap_mask(%rip),%xmm7 - shrl $1,%eax + shll $4,%eax + movl $16,%r10d leaq 0(%rcx),%r11 movdqu (%r9),%xmm3 - movdqa %xmm9,%xmm2 - movl %eax,%r10d -.byte 102,68,15,56,0,207 + movdqa %xmm6,%xmm2 + leaq 32(%rcx,%rax,1),%rcx +.byte 102,15,56,0,247 + subq %rax,%r10 jmp L$ccm64_enc_outer .p2align 4 L$ccm64_enc_outer: movups (%r11),%xmm0 - movl %r10d,%eax + movq %r10,%rax movups (%rdi),%xmm8 xorps %xmm0,%xmm2 movups 16(%r11),%xmm1 xorps %xmm8,%xmm0 - leaq 32(%r11),%rcx xorps %xmm0,%xmm3 - movups (%rcx),%xmm0 + movups 32(%r11),%xmm0 L$ccm64_enc2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 + decq %rdx .byte 102,15,56,221,208 .byte 102,15,56,221,216 - decq %rdx leaq 16(%rdi),%rdi xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) - leaq 16(%rsi),%rsi .byte 102,15,56,0,215 + leaq 16(%rsi),%rsi jnz L$ccm64_enc_outer movups %xmm3,(%r9) @@ -839,15 +900,15 @@ L$ccm64_enc2_loop: .p2align 4 _aesni_ccm64_decrypt_blocks: movl 240(%rcx),%eax - movups (%r8),%xmm9 + movups (%r8),%xmm6 movdqu (%r9),%xmm3 - movdqa L$increment64(%rip),%xmm6 + movdqa L$increment64(%rip),%xmm9 movdqa L$bswap_mask(%rip),%xmm7 - movaps %xmm9,%xmm2 + movaps %xmm6,%xmm2 movl %eax,%r10d movq %rcx,%r11 -.byte 102,68,15,56,0,207 +.byte 102,15,56,0,247 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -857,17 +918,21 @@ L$oop_enc1_5: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_5 + jnz L$oop_enc1_5 .byte 102,15,56,221,209 + shll $4,%r10d + movl $16,%eax movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 leaq 16(%rdi),%rdi + subq %r10,%rax + leaq 32(%r11,%r10,1),%rcx + movq %rax,%r10 jmp L$ccm64_dec_outer .p2align 4 L$ccm64_dec_outer: xorps %xmm2,%xmm8 - movdqa %xmm9,%xmm2 
- movl %r10d,%eax + movdqa %xmm6,%xmm2 movups %xmm8,(%rsi) leaq 16(%rsi),%rsi .byte 102,15,56,0,215 @@ -876,36 +941,36 @@ L$ccm64_dec_outer: jz L$ccm64_dec_break movups (%r11),%xmm0 - shrl $1,%eax + movq %r10,%rax movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 - leaq 32(%r11),%rcx xorps %xmm0,%xmm2 xorps %xmm8,%xmm3 - movups (%rcx),%xmm0 - + movups 32(%r11),%xmm0 + jmp L$ccm64_dec2_loop +.p2align 4 L$ccm64_dec2_loop: .byte 102,15,56,220,209 - decl %eax .byte 102,15,56,220,217 - movups 16(%rcx),%xmm1 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax .byte 102,15,56,220,208 - leaq 32(%rcx),%rcx .byte 102,15,56,220,216 - movups 0(%rcx),%xmm0 + movups -16(%rcx,%rax,1),%xmm0 jnz L$ccm64_dec2_loop movups (%rdi),%xmm8 - paddq %xmm6,%xmm9 + paddq %xmm9,%xmm6 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - leaq 16(%rdi),%rdi .byte 102,15,56,221,208 .byte 102,15,56,221,216 + leaq 16(%rdi),%rdi jmp L$ccm64_dec_outer .p2align 4 L$ccm64_dec_break: + movl 240(%r11),%eax movups (%r11),%xmm0 movups 16(%r11),%xmm1 xorps %xmm0,%xmm8 @@ -916,7 +981,7 @@ L$oop_enc1_6: decl %eax movups (%r11),%xmm1 leaq 16(%r11),%r11 - jnz L$oop_enc1_6 + jnz L$oop_enc1_6 .byte 102,15,56,221,217 movups %xmm3,(%r9) .byte 0xf3,0xc3 @@ -925,199 +990,518 @@ L$oop_enc1_6: .p2align 4 _aesni_ctr32_encrypt_blocks: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + cmpq $1,%rdx je L$ctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa L$bswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 - + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %r11d,%eax + xorl %r11d,%edx +.byte 102,15,58,34,216,3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) +.byte 102,15,58,34,226,3 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%eax + bswapl %r10d +.byte 102,15,58,34,232,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 movl 240(%rcx),%eax + xorl %r11d,%r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,-40(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,-24(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + movl __gnutls_x86_cpuid_s+4(%rip),%r10d + xorl %r11d,%r9d + andl $71303168,%r10d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx jb L$ctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d + subq $6,%rdx + cmpl $4194304,%r10d + je L$ctr32_6x + + leaq 128(%rcx),%rcx + subq $2,%rdx + jmp L$ctr32_loop8 + +.p2align 4 +L$ctr32_6x: + shll $4,%eax + movl $48,%r10d + bswapl %r11d + leaq 32(%rcx,%rax,1),%rcx + subq %rax,%r10 jmp L$ctr32_loop6 .p2align 4 L$ctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups 
(%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + addl $6,%r8d + movups -48(%rcx,%r10,1),%xmm0 +.byte 102,15,56,220,209 + movl %r8d,%eax + xorl %r11d,%eax +.byte 102,15,56,220,217 +.byte 0x0f,0x38,0xf1,0x44,0x24,12 + leal 1(%r8),%eax +.byte 102,15,56,220,225 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,28 +.byte 102,15,56,220,233 + leal 2(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,241 +.byte 0x0f,0x38,0xf1,0x44,0x24,44 + leal 3(%r8),%eax +.byte 102,15,56,220,249 + movups -32(%rcx,%r10,1),%xmm1 + xorl %r11d,%eax + +.byte 102,15,56,220,208 +.byte 0x0f,0x38,0xf1,0x44,0x24,60 + leal 4(%r8),%eax +.byte 102,15,56,220,216 + xorl %r11d,%eax +.byte 0x0f,0x38,0xf1,0x44,0x24,76 +.byte 102,15,56,220,224 + leal 5(%r8),%eax + xorl %r11d,%eax +.byte 102,15,56,220,232 +.byte 0x0f,0x38,0xf1,0x44,0x24,92 + movq %r10,%rax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%r10,1),%xmm0 + call L$enc_loop6 + movdqu (%rdi),%xmm8 + movdqu 16(%rdi),%xmm9 + movdqu 32(%rdi),%xmm10 + movdqu 48(%rdi),%xmm11 + movdqu 64(%rdi),%xmm12 + movdqu 80(%rdi),%xmm13 + leaq 96(%rdi),%rdi + movups -64(%rcx,%r10,1),%xmm1 + pxor %xmm2,%xmm8 + movaps 0(%rsp),%xmm2 + pxor %xmm3,%xmm9 + movaps 16(%rsp),%xmm3 + pxor %xmm4,%xmm10 + movaps 32(%rsp),%xmm4 + pxor %xmm5,%xmm11 + movaps 48(%rsp),%xmm5 + pxor %xmm6,%xmm12 + movaps 64(%rsp),%xmm6 + pxor %xmm7,%xmm13 + movaps 80(%rsp),%xmm7 + movdqu %xmm8,(%rsi) + movdqu %xmm9,16(%rsi) + movdqu %xmm10,32(%rsi) + movdqu %xmm11,48(%rsi) + movdqu %xmm12,64(%rsi) + movdqu %xmm13,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ctr32_loop6 - pxor %xmm0,%xmm3 + addq $6,%rdx + jz L$ctr32_done + + leal -48(%r10),%eax + leaq -80(%rcx,%r10,1),%rcx + negl %eax + shrl $4,%eax + jmp L$ctr32_tail + +.p2align 5 +L$ctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa L$increment32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa -40(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d + nop .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp L$ctr32_enc_loop6_enter -.p2align 4 -L$ctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 + bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$ctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 + bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + xorl %r11d,%r9d +.byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 
102,15,56,220,248 - movups (%rcx),%xmm0 - jnz L$ctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 + bswapl %r9d +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + xorl %r11d,%r9d +.byte 0x66,0x90 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 + bswapl %r9d +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 + xorl %r11d,%r9d + movdqu 0(%rdi),%xmm10 +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) + cmpl $11,%eax +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + jb L$ctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd -24(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,-40(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,-24(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je L$ctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc L$ctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 - addq $6,%rdx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + jmp L$ctr32_enc_done + +.p2align 4 +L$ctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 
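In the added L$ctr32_loop8 body above, the stores movl %r9d,N+12(%rsp) woven between the aesenc rounds rebuild the next batch of counter blocks while the current batch encrypts: the first twelve bytes of every block in the 32-bit-counter mode are invariant across the batch (the IV already XORed with round key 0), so only the last dword, bswap(counter+i) XOR the cached last dword of round key 0 in %r11d, is written per block. A C sketch of the per-block computation, assuming a little-endian host and a big-endian 32-bit counter in the last four bytes of the IV; names are illustrative:

    #include <stdint.h>
    #include <string.h>

    static uint32_t bswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0xff00u)
             | ((v << 8) & 0xff0000u) | (v << 24);
    }

    /* i-th key-whitened counter block, as laid out at i*16(%rsp) */
    static void ctr32_block(uint8_t out[16], const uint8_t ivec[16],
                            const uint8_t rk0[16], uint32_t i)
    {
        uint32_t ctr, kw;

        for (int j = 0; j < 16; j++)      /* pxor %xmm0,%xmm2, copied    */
            out[j] = ivec[j] ^ rk0[j];    /* once into all eight slots   */

        memcpy(&ctr, ivec + 12, 4);       /* movl 12(%r8),%r8d           */
        ctr = bswap32(ctr) + i;           /* bswapl %r8d; leaq i(%r8),%r9 */
        ctr = bswap32(ctr);               /* bswapl %r9d                 */
        memcpy(&kw, rk0 + 12, 4);         /* movl 12(%rcx),%r11d         */
        ctr ^= kw;                        /* xorl %r11d,%r9d             */
        memcpy(out + 12, &ctr, 4);        /* movl %r9d,N+12(%rsp)        */
    }

Because the stored dword is already XORed with round key 0, the eight refreshed blocks on the stack can enter the next iteration's first round with no extra whitening step.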
+ pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 +.byte 102,69,15,56,221,202 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc L$ctr32_loop8 + + addq $8,%rdx jz L$ctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx L$ctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb L$ctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb L$ctr32_loop3 + je L$ctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je L$ctr32_two + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb L$ctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax +.byte 102,15,56,220,225 + addq $16,%rax + movups (%rdi),%xmm10 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je L$ctr32_four + call L$enc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb L$ctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je L$ctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + jnz L$ctr32_loop4 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 +.byte 102,15,56,221,233 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop3: +.byte 
102,15,56,220,209 + leaq 16(%rcx),%rcx + decl %eax +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + jnz L$ctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb L$ctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je L$ctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp L$ctr32_done .p2align 4 L$ctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -L$ctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1127,289 +1511,303 @@ L$oop_enc1_7: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_7 + jnz L$oop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp L$ctr32_done - -.p2align 4 -L$ctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp L$ctr32_done - -.p2align 4 -L$ctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp L$ctr32_done .p2align 4 -L$ctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - L$ctr32_done: + leaq (%rbp),%rsp + popq %rbp +L$ctr32_epilogue: .byte 0xf3,0xc3 .globl _aesni_xts_encrypt .p2align 4 _aesni_xts_encrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 L$oop_enc1_8: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz L$oop_enc1_8 -.byte 102,68,15,56,221,249 + jnz L$oop_enc1_8 +.byte 102,15,56,221,209 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa L$xts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand 
%xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc L$xts_enc_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop -.p2align 4 +.p2align 5 L$xts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp L$xts_enc_loop6_enter - -.p2align 4 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp L$xts_enc_loop6 +.p2align 5 L$xts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$xts_enc_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz L$xts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 .byte 102,15,56,220,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pand %xmm8,%xmm14 + movaps 
%xmm11,%xmm12 .byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,220,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,221,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc L$xts_enc_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax L$xts_enc_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 addq $96,%rdx jz L$xts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb L$xts_enc_one + pxor %xmm0,%xmm12 je L$xts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb L$xts_enc_three + pxor %xmm0,%xmm14 je L$xts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1450,7 +1848,7 @@ L$oop_enc1_9: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_9 + jnz L$oop_enc1_9 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1466,7 +1864,7 @@ L$xts_enc_two: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1512,15 +1910,15 @@ L$xts_enc_four: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - 
xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp L$xts_enc_done @@ -1555,13 +1953,14 @@ L$oop_enc1_10: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_10 + jnz L$oop_enc1_10 .byte 102,15,56,221,209 xorps %xmm10,%xmm2 movups %xmm2,-16(%rsi) L$xts_enc_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -1569,249 +1968,292 @@ L$xts_enc_epilogue: .p2align 4 _aesni_xts_decrypt: - leaq -104(%rsp),%rsp - movups (%r9),%xmm15 + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d movups (%r8),%xmm0 movups 16(%r8),%xmm1 leaq 32(%r8),%r8 - xorps %xmm0,%xmm15 + xorps %xmm0,%xmm2 L$oop_enc1_11: -.byte 102,68,15,56,220,249 +.byte 102,15,56,220,209 decl %eax movups (%r8),%xmm1 leaq 16(%r8),%r8 - jnz L$oop_enc1_11 -.byte 102,68,15,56,221,249 + jnz L$oop_enc1_11 +.byte 102,15,56,221,209 xorl %eax,%eax testq $15,%rdx setnz %al shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movdqa L$xts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + movdqa %xmm2,%xmm15 + pshufd $95,%xmm2,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc L$xts_dec_short - shrl $1,%eax - subl $1,%eax - movl %eax,%r10d + movl $16+96,%eax + leaq 32(%r11,%r10,1),%rcx + subq %r10,%rax + movups 16(%r11),%xmm1 + movq %rax,%r10 + leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop -.p2align 4 +.p2align 5 L$xts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - 
movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm12 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm13 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm14 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp L$xts_dec_loop6_enter - -.p2align 4 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + movdqa %xmm8,80(%rsp) + pshufd $95,%xmm15,%xmm9 + jmp L$xts_dec_loop6 +.p2align 5 L$xts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -L$xts_dec_loop6_enter: - movups 16(%rcx),%xmm1 + movups -64(%rcx,%rax,1),%xmm1 + addq $32,%rax + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups (%rcx),%xmm0 + movups -80(%rcx,%rax,1),%xmm0 jnz L$xts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 .byte 102,15,56,222,241 + pxor %xmm14,%xmm15 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 - movups 16(%rcx),%xmm1 + movups -64(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + paddd %xmm9,%xmm9 + pxor %xmm15,%xmm10 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pand %xmm8,%xmm14 + movaps %xmm11,%xmm12 .byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,248 - movups 32(%rcx),%xmm0 + movups -48(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + movdqa %xmm13,48(%rsp) + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 + movdqa %xmm9,%xmm14 .byte 102,15,56,222,249 + movups -32(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 
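The replacement tweak arithmetic in these XTS hunks (pshufd $95, psrad $31, pand against L$xts_magic, paddq, pxor) is a branch-free doubling of the tweak in GF(2^128): paddq shifts each 64-bit half left by one, and the sign-propagated masks feed back a 1 for the carry between halves and 0x87 for the reduction when bit 127 falls off, matching the constant .long 0x87,0,1,0. A scalar C rendering of one update, assuming the tweak is held as two little-endian 64-bit halves:

    #include <stdint.h>

    /* tweak *= x  (mod x^128 + x^7 + x^2 + x + 1), branch-free */
    static void xts_tweak_double(uint64_t t[2])
    {
        /* psrad $31 on the pshufd $95 copy: all-ones iff the MSB of
         * the corresponding half is set */
        uint64_t carry_lo = (uint64_t)((int64_t)t[0] >> 63);
        uint64_t carry_hi = (uint64_t)((int64_t)t[1] >> 63);

        t[0] <<= 1;                 /* paddq %xmm15,%xmm15, low half   */
        t[1] <<= 1;                 /*                      high half  */
        t[1] ^= carry_lo & 1;       /* the "1" lane of L$xts_magic     */
        t[0] ^= carry_hi & 0x87;    /* the 0x87 lane: reduce bit 127   */
    }

The removed pcmpgtd/pshufd $19 sequence computed the same masks in more steps; the six tweaks %xmm10..%xmm15 prepared before the grand loop are successive applications of this update, so each six-block pass only pays the feedback above.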
-.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 + pxor %xmm14,%xmm15 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 +.byte 102,15,56,223,84,36,0 + psrad $31,%xmm9 + paddq %xmm15,%xmm15 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 + movq %r10,%rax +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 pxor %xmm9,%xmm15 - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) - movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc L$xts_dec_grandloop - leal 3(%rax,%rax,1),%eax + movl $16+96,%eax + subl %r10d,%eax movq %r11,%rcx - movl %eax,%r10d + shrl $4,%eax L$xts_dec_short: + movl %eax,%r10d + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz L$xts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb L$xts_dec_one + pxor %xmm0,%xmm13 je L$xts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb L$xts_dec_three je L$xts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1861,7 +2303,7 @@ L$oop_dec1_12: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_12 + jnz L$oop_dec1_12 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movdqa %xmm11,%xmm10 @@ -1878,7 +2320,7 @@ L$xts_dec_two: xorps %xmm10,%xmm2 xorps %xmm11,%xmm3 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps %xmm10,%xmm2 movdqa %xmm12,%xmm10 @@ -1904,7 +2346,7 @@ L$xts_dec_three: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -1914,14 +2356,8 @@ L$xts_dec_three: .p2align 4 L$xts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -1932,16 +2368,16 @@ L$xts_dec_four: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp L$xts_dec_done @@ -1965,7 +2401,7 
@@ L$oop_dec1_13: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_13 + jnz L$oop_dec1_13 .byte 102,15,56,223,209 xorps %xmm11,%xmm2 movups %xmm2,(%rsi) @@ -1995,13 +2431,14 @@ L$oop_dec1_14: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_14 + jnz L$oop_dec1_14 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movups %xmm2,(%rsi) L$xts_dec_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp L$xts_dec_epilogue: .byte 0xf3,0xc3 @@ -2038,7 +2475,7 @@ L$oop_enc1_15: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_enc1_15 + jnz L$oop_enc1_15 .byte 102,15,56,221,209 movl %r10d,%eax movq %r11,%rcx @@ -2054,163 +2491,397 @@ L$oop_enc1_15: L$cbc_enc_tail: movq %rdx,%rcx xchgq %rdi,%rsi -.long 0x9066A4F3 +.long 0x9066A4F3 movl $16,%ecx subq %rdx,%rcx xorl %eax,%eax -.long 0x9066AAF3 +.long 0x9066AAF3 leaq -16(%rdi),%rdi movl %r10d,%eax movq %rdi,%rsi movq %r11,%rcx xorq %rdx,%rdx - jmp L$cbc_enc_loop + jmp L$cbc_enc_loop .p2align 4 L$cbc_decrypt: - movups (%r8),%xmm9 + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe L$cbc_dec_tail - shrl $1,%r10d - subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,-24(%rsp) + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + movl __gnutls_x86_cpuid_s+4(%rip),%r9d + cmpq $112,%rdx + jbe L$cbc_dec_six_or_seven + + andl $71303168,%r9d + subq $80,%rdx + cmpl $4194304,%r9d + je L$cbc_dec_loop6_enter + subq $32,%rdx + leaq 112(%rcx),%rcx jmp L$cbc_dec_loop8_enter .p2align 4 L$cbc_dec_loop8: - movaps %xmm0,-24(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi L$cbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 .byte 102,68,15,56,222,193 + setnc %r11b + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call L$dec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 
102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 + nop +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + jb L$cbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je L$cbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 + nop +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 + jmp L$cbc_dec_done +.p2align 4 +L$cbc_dec_done: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 +.byte 102,65,15,56,223,228 + leaq 128(%rdi),%rdi + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 + movdqu 16(%r11),%xmm12 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 +.byte 102,68,15,56,223,193 + movdqu 48(%r11),%xmm14 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + 
movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja L$cbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle L$cbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe L$cbc_dec_tail + + movaps %xmm11,%xmm2 +L$cbc_dec_six_or_seven: + cmpq $96,%rdx + ja L$cbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_loop6: + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 +L$cbc_dec_loop6_enter: + leaq 96(%rdi),%rdi + movdqa %xmm7,%xmm8 + + call _aesni_decrypt6 + + pxor %xmm10,%xmm2 + movdqa %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movq %r11,%rcx + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movl %r10d,%eax + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + subq $96,%rdx + ja L$cbc_dec_loop6 + + movdqa %xmm7,%xmm2 + addq $80,%rdx + jle L$cbc_dec_tail_collected + movups %xmm7,(%rsi) + leaq 16(%rsi),%rsi + L$cbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe L$cbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe L$cbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe L$cbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe L$cbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe L$cbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe L$cbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,-24(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor 
%xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp L$cbc_dec_tail_collected + .p2align 4 L$cbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2220,113 +2891,70 @@ L$oop_dec1_16: decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_two: - xorps %xmm4,%xmm4 - call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp L$cbc_dec_tail_collected + .p2align 4 L$cbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz L$cbc_dec_tail_partial movups %xmm2,(%rsi) jmp L$cbc_dec_ret .p2align 4 L$cbc_dec_tail_partial: - movaps %xmm2,-24(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq -24(%rsp),%rsi -.long 0x9066A4F3 + leaq (%rsp),%rsi +.long 0x9066A4F3 L$cbc_dec_ret: + leaq (%rbp),%rsp + popq %rbp L$cbc_ret: .byte 0xf3,0xc3 @@ -2334,7 +2962,7 @@ L$cbc_ret: .p2align 4 _aesni_set_decrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 call 
__aesni_set_encrypt_key shll $4,%esi testl %eax,%eax @@ -2373,7 +3001,7 @@ L$SEH_end_set_decrypt_key: .p2align 4 _aesni_set_encrypt_key: __aesni_set_encrypt_key: -.byte 0x48,0x83,0xEC,0x08 +.byte 0x48,0x83,0xEC,0x08 movq $-1,%rax testq %rdi,%rdi jz L$enc_key_ret @@ -2569,6 +3197,8 @@ L$increment64: .long 1,0,0,0 L$xts_magic: .long 0x87,0,1,0 +L$increment1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/e_padlock-x86.s b/lib/accelerated/x86/macosx/e_padlock-x86.s index 3ba69bd352..367962c7cb 100644 --- a/lib/accelerated/x86/macosx/e_padlock-x86.s +++ b/lib/accelerated/x86/macosx/e_padlock-x86.s @@ -174,16 +174,14 @@ L005ecb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe L006ecb_short testl $32,(%edx) - jnz L007ecb_aligned + jnz L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz L007ecb_aligned + jnz L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -195,10 +193,28 @@ L005ecb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L008ecb_unaligned_tail + jmp L007ecb_loop .align 4,0x90 -L008ecb_loop: +L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -223,8 +239,8 @@ L009ecb_inp_aligned: testl $15,%edi jz L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi L010ecb_out_aligned: @@ -234,43 +250,75 @@ L010ecb_out_aligned: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L008ecb_loop + jz L011ecb_break + cmpl %ebx,%ecx + jae L007ecb_loop +L008ecb_unaligned_tail: + xorl %eax,%eax cmpl %ebp,%esp - je L011ecb_done + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +.align 4,0x90 +L011ecb_break: + cmpl %ebp,%esp + je L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -L012ecb_bzero: +L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L012ecb_bzero -L011ecb_done: + ja L013ecb_bzero +L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L013ecb_exit + jmp L014ecb_exit .align 4,0x90 -L006ecb_short: +L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L008ecb_loop -.align 4,0x90 -L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -L013ecb_exit: + testl %ebp,%ebp + jz L014ecb_exit +L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl 
%edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L007ecb_loop +L014ecb_exit: movl $1,%eax leal 4(%esp),%esp L004ecb_abort: @@ -292,19 +340,17 @@ L_padlock_cbc_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L015cbc_abort + jnz L016cbc_abort testl $15,%ecx - jnz L015cbc_abort - leal Lpadlock_saved_context-L016cbc_pic_point,%eax + jnz L016cbc_abort + leal Lpadlock_saved_context-L017cbc_pic_point,%eax pushfl cld call __padlock_verify_ctx -L016cbc_pic_point: +L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe L017cbc_short testl $32,(%edx) jnz L018cbc_aligned testl $15,%edi @@ -324,7 +370,25 @@ L016cbc_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz L020cbc_unaligned_tail jmp L019cbc_loop .align 4,0x90 L019cbc_loop: @@ -336,13 +400,13 @@ L019cbc_loop: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz L020cbc_inp_aligned + jz L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -L020cbc_inp_aligned: +L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -352,61 +416,93 @@ L020cbc_inp_aligned: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz L021cbc_out_aligned + jz L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -L021cbc_out_aligned: +L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L019cbc_loop + jz L023cbc_break + cmpl %ebx,%ecx + jae L019cbc_loop +L020cbc_unaligned_tail: + xorl %eax,%eax + cmpl %ebp,%esp + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +.align 4,0x90 +L023cbc_break: cmpl %ebp,%esp - je L022cbc_done + je L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -L023cbc_bzero: +L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L023cbc_bzero -L022cbc_done: + ja L025cbc_bzero +L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L024cbc_exit -.align 4,0x90 -L017cbc_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L025cbc_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L019cbc_loop + jmp L026cbc_exit .align 4,0x90 L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp + xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz L027cbc_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,208 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -L024cbc_exit: + testl %ebp,%ebp + jz L026cbc_exit +L027cbc_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl 
%eax,%edi + movl %ebx,%ecx + jmp L019cbc_loop +L026cbc_exit: movl $1,%eax leal 4(%esp),%esp -L015cbc_abort: +L016cbc_abort: popl %edi popl %esi popl %ebx @@ -425,25 +521,25 @@ L_padlock_cfb_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L026cfb_abort + jnz L028cfb_abort testl $15,%ecx - jnz L026cfb_abort - leal Lpadlock_saved_context-L027cfb_pic_point,%eax + jnz L028cfb_abort + leal Lpadlock_saved_context-L029cfb_pic_point,%eax pushfl cld call __padlock_verify_ctx -L027cfb_pic_point: +L029cfb_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx testl $32,(%edx) - jnz L028cfb_aligned + jnz L030cfb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz L028cfb_aligned + jnz L030cfb_aligned negl %eax movl $512,%ebx notl %eax @@ -455,10 +551,15 @@ L027cfb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L029cfb_loop + movl %eax,16(%ebp) + jmp L031cfb_loop .align 4,0x90 -L029cfb_loop: +L031cfb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -467,13 +568,13 @@ L029cfb_loop: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz L030cfb_inp_aligned + jz L032cfb_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -L030cfb_inp_aligned: +L032cfb_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -483,61 +584,45 @@ L030cfb_inp_aligned: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz L031cfb_out_aligned + jz L033cfb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -L031cfb_out_aligned: +L033cfb_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L029cfb_loop + jnz L031cfb_loop cmpl %ebp,%esp - je L032cfb_done + je L034cfb_done pxor %xmm0,%xmm0 leal (%esp),%eax -L033cfb_bzero: +L035cfb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L033cfb_bzero -L032cfb_done: + ja L035cfb_bzero +L034cfb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp L034cfb_exit + jmp L036cfb_exit .align 4,0x90 -L035cfb_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L036cfb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L036cfb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L029cfb_loop -.align 4,0x90 -L028cfb_aligned: +L030cfb_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,224 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -L034cfb_exit: +L036cfb_exit: movl $1,%eax leal 4(%esp),%esp -L026cfb_abort: +L028cfb_abort: popl %edi popl %esi popl %ebx @@ -586,7 +671,12 @@ L038ofb_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) jmp L040ofb_loop .align 4,0x90 L040ofb_loop: @@ -616,8 +706,8 @@ L041ofb_inp_aligned: testl $15,%edi jz L042ofb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi L042ofb_out_aligned: @@ -638,26 +728,10 @@ L044ofb_bzero: cmpl %eax,%ebp ja L044ofb_bzero L043ofb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp jmp L045ofb_exit .align 4,0x90 -L046ofb_short: - xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -L047ofb_short_copy: - movups (%esi,%ebx,1),%xmm0 - 
leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja L047ofb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp L040ofb_loop -.align 4,0x90 L039ofb_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx @@ -687,14 +761,14 @@ L_padlock_ctr32_encrypt_begin: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz L048ctr32_abort + jnz L046ctr32_abort testl $15,%ecx - jnz L048ctr32_abort - leal Lpadlock_saved_context-L049ctr32_pic_point,%eax + jnz L046ctr32_abort + leal Lpadlock_saved_context-L047ctr32_pic_point,%eax pushfl cld call __padlock_verify_ctx -L049ctr32_pic_point: +L047ctr32_pic_point: leal 16(%edx),%edx xorl %eax,%eax movq -16(%edx),%mm0 @@ -708,10 +782,15 @@ L049ctr32_pic_point: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp L050ctr32_loop + movl %eax,16(%ebp) + jmp L048ctr32_loop .align 4,0x90 -L050ctr32_loop: +L048ctr32_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -720,7 +799,7 @@ L050ctr32_loop: movl -4(%edx),%ecx xorl %edi,%edi movl -8(%edx),%eax -L051ctr32_prepare: +L049ctr32_prepare: movl %ecx,12(%esp,%edi,1) bswap %ecx movq %mm0,(%esp,%edi,1) @@ -729,7 +808,7 @@ L051ctr32_prepare: bswap %ecx leal 16(%edi),%edi cmpl %ebx,%edi - jb L051ctr32_prepare + jb L049ctr32_prepare movl %ecx,-4(%edx) leal (%esp),%esi leal (%esp),%edi @@ -742,32 +821,33 @@ L051ctr32_prepare: movl 12(%ebp),%ebx movl 4(%ebp),%esi xorl %ecx,%ecx -L052ctr32_xor: +L050ctr32_xor: movups (%esi,%ecx,1),%xmm1 leal 16(%ecx),%ecx pxor -16(%esp,%ecx,1),%xmm1 movups %xmm1,-16(%edi,%ecx,1) cmpl %ebx,%ecx - jb L052ctr32_xor + jb L050ctr32_xor movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz L050ctr32_loop + jnz L048ctr32_loop pxor %xmm0,%xmm0 leal (%esp),%eax -L053ctr32_bzero: +L051ctr32_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja L053ctr32_bzero -L054ctr32_done: + ja L051ctr32_bzero +L052ctr32_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp movl $1,%eax leal 4(%esp),%esp emms -L048ctr32_abort: +L046ctr32_abort: popl %edi popl %esi popl %ebx @@ -789,10 +869,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne L055ret + jne L053ret addl $4,184(%ecx) movl $0,%eax -L055ret: +L053ret: ret .globl _padlock_sha1_oneshot .align 4 diff --git a/lib/accelerated/x86/macosx/e_padlock-x86_64.s b/lib/accelerated/x86/macosx/e_padlock-x86_64.s index 00c2bdaac0..ec1377ad88 100644 --- a/lib/accelerated/x86/macosx/e_padlock-x86_64.s +++ b/lib/accelerated/x86/macosx/e_padlock-x86_64.s @@ -127,7 +127,7 @@ _padlock_aes_block: movq $1,%rcx leaq 32(%rdx),%rbx leaq 16(%rdx),%rdx -.byte 0xf3,0x0f,0xa7,0xc8 +.byte 0xf3,0x0f,0xa7,0xc8 movq %r8,%rbx .byte 0xf3,0xc3 @@ -137,7 +137,7 @@ _padlock_aes_block: .p2align 4 _padlock_xstore: movl %esi,%edx -.byte 0x0f,0xa7,0xc0 +.byte 0x0f,0xa7,0xc0 .byte 0xf3,0xc3 @@ -154,7 +154,7 @@ _padlock_sha1_oneshot: movq %rsp,%rdi movl %eax,16(%rsp) xorq %rax,%rax -.byte 0xf3,0x0f,0xa6,0xc8 +.byte 0xf3,0x0f,0xa6,0xc8 movaps (%rsp),%xmm0 movl 16(%rsp),%eax addq $128+8,%rsp @@ -176,7 +176,7 @@ _padlock_sha1_blocks: movq %rsp,%rdi movl %eax,16(%rsp) movq $-1,%rax -.byte 0xf3,0x0f,0xa6,0xc8 +.byte 0xf3,0x0f,0xa6,0xc8 movaps (%rsp),%xmm0 movl 16(%rsp),%eax addq $128+8,%rsp @@ -198,7 +198,7 @@ _padlock_sha256_oneshot: movq %rsp,%rdi movaps %xmm1,16(%rsp) xorq %rax,%rax -.byte 0xf3,0x0f,0xa6,0xd0 +.byte 0xf3,0x0f,0xa6,0xd0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 addq $128+8,%rsp @@ -220,7 +220,7 @@ 
_padlock_sha256_blocks: movq %rsp,%rdi movaps %xmm1,16(%rsp) movq $-1,%rax -.byte 0xf3,0x0f,0xa6,0xd0 +.byte 0xf3,0x0f,0xa6,0xd0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 addq $128+8,%rsp @@ -245,7 +245,7 @@ _padlock_sha512_blocks: movaps %xmm1,16(%rsp) movaps %xmm2,32(%rsp) movaps %xmm3,48(%rsp) -.byte 0xf3,0x0f,0xa6,0xe0 +.byte 0xf3,0x0f,0xa6,0xe0 movaps (%rsp),%xmm0 movaps 16(%rsp),%xmm1 movaps 32(%rsp),%xmm2 @@ -276,8 +276,6 @@ _padlock_ecb_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $128,%rcx - jbe L$ecb_short testl $32,(%rdx) jnz L$ecb_aligned testq $15,%rdi @@ -297,6 +295,21 @@ _padlock_ecb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$ecb_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $128,%rax + movq $-128,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ecb_unaligned_tail jmp L$ecb_loop .p2align 4 L$ecb_loop: @@ -312,7 +325,7 @@ L$ecb_loop: testq $15,%rsi jz L$ecb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -320,15 +333,15 @@ L$ecb_inp_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,200 +.byte 0xf3,0x0f,0xa7,200 movq %r8,%rdi movq %r11,%rbx testq $15,%rdi jz L$ecb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ecb_out_aligned: movq %r9,%rsi @@ -337,9 +350,26 @@ L$ecb_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$ecb_loop - + jz L$ecb_break + cmpq %rbx,%rcx + jae L$ecb_loop +L$ecb_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ecb_loop +.p2align 4 +L$ecb_break: + cmpq %rbp,%rsp je L$ecb_done pxor %xmm0,%xmm0 @@ -353,26 +383,39 @@ L$ecb_bzero: L$ecb_done: leaq (%rbp),%rsp jmp L$ecb_exit -.p2align 4 -L$ecb_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$ecb_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$ecb_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$ecb_loop + .p2align 4 L$ecb_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $128,%rbp + movq $128-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ecb_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,200 +.byte 0xf3,0x0f,0xa7,200 + testq %rbp,%rbp + jz L$ecb_exit + +L$ecb_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ecb_loop L$ecb_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -400,8 +443,6 @@ _padlock_cbc_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe L$cbc_short testl $32,(%rdx) jnz L$cbc_aligned testq $15,%rdi @@ -421,6 +462,21 @@ _padlock_cbc_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + cmpq %rbx,%rcx + ja L$cbc_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $64,%rax + movq $-64,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$cbc_unaligned_tail jmp L$cbc_loop .p2align 4 L$cbc_loop: @@ 
-436,7 +492,7 @@ L$cbc_loop: testq $15,%rsi jz L$cbc_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -444,7 +500,7 @@ L$cbc_inp_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,208 +.byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -452,9 +508,9 @@ L$cbc_inp_aligned: testq $15,%rdi jz L$cbc_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$cbc_out_aligned: movq %r9,%rsi @@ -463,9 +519,26 @@ L$cbc_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx - jnz L$cbc_loop - + jz L$cbc_break + cmpq %rbx,%rcx + jae L$cbc_loop +L$cbc_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$cbc_loop +.p2align 4 +L$cbc_break: + cmpq %rbp,%rsp je L$cbc_done pxor %xmm0,%xmm0 @@ -479,28 +552,41 @@ L$cbc_bzero: L$cbc_done: leaq (%rbp),%rsp jmp L$cbc_exit -.p2align 4 -L$cbc_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$cbc_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$cbc_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$cbc_loop + .p2align 4 L$cbc_aligned: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $64,%rbp + movq $64-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$cbc_aligned_tail leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,208 +.byte 0xf3,0x0f,0xa7,208 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) + testq %rbp,%rbp + jz L$cbc_exit + +L$cbc_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$cbc_loop L$cbc_exit: movl $1,%eax leaq 8(%rsp),%rsp @@ -547,6 +633,8 @@ _padlock_cfb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx jmp L$cfb_loop .p2align 4 L$cfb_loop: @@ -562,7 +650,7 @@ L$cfb_loop: testq $15,%rsi jz L$cfb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -570,7 +658,7 @@ L$cfb_inp_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,224 +.byte 0xf3,0x0f,0xa7,224 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -578,9 +666,9 @@ L$cfb_inp_aligned: testq $15,%rdi jz L$cfb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$cfb_out_aligned: movq %r9,%rsi @@ -590,8 +678,7 @@ L$cfb_out_aligned: subq %rbx,%rcx movq $512,%rbx jnz L$cfb_loop - - cmpq %rsp,%rbp + cmpq %rbp,%rsp je L$cfb_done pxor %xmm0,%xmm0 @@ -605,12 +692,13 @@ L$cfb_bzero: L$cfb_done: leaq (%rbp),%rsp jmp L$cfb_exit + .p2align 4 L$cfb_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,224 +.byte 0xf3,0x0f,0xa7,224 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) L$cfb_exit: @@ -659,6 +747,8 @@ _padlock_ofb_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx jmp L$ofb_loop .p2align 4 L$ofb_loop: @@ -674,7 +764,7 @@ L$ofb_loop: testq $15,%rsi jz L$ofb_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 
0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -682,7 +772,7 @@ L$ofb_inp_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,232 +.byte 0xf3,0x0f,0xa7,232 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) movq %r8,%rdi @@ -690,9 +780,9 @@ L$ofb_inp_aligned: testq $15,%rdi jz L$ofb_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ofb_out_aligned: movq %r9,%rsi @@ -702,8 +792,7 @@ L$ofb_out_aligned: subq %rbx,%rcx movq $512,%rbx jnz L$ofb_loop - - cmpq %rsp,%rbp + cmpq %rbp,%rsp je L$ofb_done pxor %xmm0,%xmm0 @@ -717,12 +806,13 @@ L$ofb_bzero: L$ofb_done: leaq (%rbp),%rsp jmp L$ofb_exit + .p2align 4 L$ofb_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,232 +.byte 0xf3,0x0f,0xa7,232 movdqa (%rax),%xmm0 movdqa %xmm0,-16(%rdx) L$ofb_exit: @@ -752,8 +842,6 @@ _padlock_ctr32_encrypt: leaq 16(%rdx),%rdx xorl %eax,%eax xorl %ebx,%ebx - cmpq $64,%rcx - jbe L$ctr32_short testl $32,(%rdx) jnz L$ctr32_aligned testq $15,%rdi @@ -773,15 +861,32 @@ _padlock_ctr32_encrypt: negq %rax andq $512-1,%rbx leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx L$ctr32_reenter: movl -4(%rdx),%eax bswapl %eax negl %eax andl $31,%eax - jz L$ctr32_loop + movq $512,%rbx shll $4,%eax + cmovzq %rbx,%rax cmpq %rax,%rcx cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + cmpq %rbx,%rcx + ja L$ctr32_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ctr32_unaligned_tail jmp L$ctr32_loop .p2align 4 L$ctr32_loop: @@ -797,7 +902,7 @@ L$ctr32_loop: testq $15,%rsi jz L$ctr32_inp_aligned shrq $3,%rcx -.byte 0xf3,0x48,0xa5 +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi movq %rbx,%rcx movq %rdi,%rsi @@ -805,23 +910,23 @@ L$ctr32_inp_aligned: leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,216 +.byte 0xf3,0x0f,0xa7,216 movl -4(%rdx),%eax testl $4294901760,%eax - jnz L$ctr32_no_corr + jnz L$ctr32_no_carry bswapl %eax addl $65536,%eax bswapl %eax movl %eax,-4(%rdx) -L$ctr32_no_corr: +L$ctr32_no_carry: movq %r8,%rdi movq %r11,%rbx testq $15,%rdi jz L$ctr32_out_aligned movq %rbx,%rcx - shrq $3,%rcx leaq (%rsp),%rsi -.byte 0xf3,0x48,0xa5 + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 subq %rbx,%rdi L$ctr32_out_aligned: movq %r9,%rsi @@ -830,9 +935,38 @@ L$ctr32_out_aligned: addq %rbx,%rsi subq %rbx,%rcx movq $512,%rbx + jz L$ctr32_break + cmpq %rbx,%rcx + jae L$ctr32_loop + movq %rcx,%rbx + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx jnz L$ctr32_loop - +L$ctr32_unaligned_tail: + xorl %eax,%eax cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ctr32_loop +.p2align 4 +L$ctr32_break: + cmpq %rbp,%rsp je L$ctr32_done pxor %xmm0,%xmm0 @@ -846,56 +980,75 @@ L$ctr32_bzero: L$ctr32_done: leaq (%rbp),%rsp jmp L$ctr32_exit -.p2align 4 -L$ctr32_short: - movq %rsp,%rbp - subq %rcx,%rsp - xorq %rbx,%rbx -L$ctr32_short_copy: - movups (%rsi,%rbx,1),%xmm0 - leaq 16(%rbx),%rbx - cmpq %rbx,%rcx - movaps %xmm0,-16(%rsp,%rbx,1) - ja L$ctr32_short_copy - movq %rsp,%rsi - movq %rcx,%rbx - jmp L$ctr32_reenter + .p2align 4 L$ctr32_aligned: movl -4(%rdx),%eax - movq $1048576,%rbx bswapl %eax - cmpq %rcx,%rbx 
- cmovaq %rcx,%rbx negl %eax andl $65535,%eax - jz L$ctr32_aligned_loop + movq $1048576,%rbx shll $4,%eax + cmovzq %rbx,%rax cmpq %rax,%rcx cmovaq %rax,%rbx - jmp L$ctr32_aligned_loop -.p2align 4 + cmovbeq %rcx,%rbx + jbe L$ctr32_aligned_skip + L$ctr32_aligned_loop: - cmpq %rcx,%rbx - cmovaq %rcx,%rbx movq %rcx,%r10 movq %rbx,%rcx movq %rbx,%r11 + leaq -16(%rdx),%rax leaq 16(%rdx),%rbx shrq $4,%rcx -.byte 0xf3,0x0f,0xa7,216 +.byte 0xf3,0x0f,0xa7,216 + movl -4(%rdx),%eax bswapl %eax addl $65536,%eax bswapl %eax movl %eax,-4(%rdx) - movq %r11,%rbx movq %r10,%rcx - subq %rbx,%rcx + subq %r11,%rcx movq $1048576,%rbx - jnz L$ctr32_aligned_loop + jz L$ctr32_exit + cmpq %rbx,%rcx + jae L$ctr32_aligned_loop + +L$ctr32_aligned_skip: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $32,%rbp + movq $32-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ctr32_aligned_tail + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + testq %rbp,%rbp + jz L$ctr32_exit + +L$ctr32_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ctr32_loop L$ctr32_exit: movl $1,%eax leaq 8(%rsp),%rsp diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s index 8aa7ffc04f..a63034a6fc 100644 --- a/lib/accelerated/x86/macosx/ghash-x86_64.s +++ b/lib/accelerated/x86/macosx/ghash-x86_64.s @@ -39,6 +39,7 @@ # .text + .globl _gcm_gmult_4bit .p2align 4 @@ -697,6 +698,7 @@ L$ghash_epilogue: .p2align 4 _gcm_init_clmul: +L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -715,15 +717,15 @@ _gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -733,44 +735,134 @@ _gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 
102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rdi) - movdqu %xmm0,16(%rdi) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 .globl _gcm_gmult_clmul .p2align 4 _gcm_gmult_clmul: +L$_gmult_clmul: movdqu (%rdi),%xmm0 movdqa L$bswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -783,201 +875,381 @@ _gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .globl _gcm_ghash_clmul -.p2align 4 +.p2align 5 _gcm_ghash_clmul: - movdqa L$bswap_mask(%rip),%xmm5 +L$_ghash_clmul: + movdqa L$bswap_mask(%rip),%xmm10 movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 -.byte 102,15,56,0,197 + movdqu 32(%rsi),%xmm7 +.byte 102,65,15,56,0,194 subq $16,%rcx jz L$odd_tail - movdqu 16(%rsi),%xmm8 - + movdqu 16(%rsi),%xmm6 + movl __gnutls_x86_cpuid_s+4(%rip),%eax + cmpq $48,%rcx + jb L$skip4x + + andl $71303168,%eax + cmpl $4194304,%eax + je L$skip4x + + subq $48,%rcx + movq $11547335547999543296,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 +.byte 102,65,15,56,0,218 +.byte 102,69,15,56,0,218 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,68,15,58,68,222,0 +.byte 102,68,15,58,68,238,17 + xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 +.byte 102,69,15,56,0,218 +.byte 102,69,15,56,0,194 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jc L$tail4x + + jmp L$mod4_loop +.p2align 5 +L$mod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 +.byte 
102,69,15,56,0,218 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 +.byte 102,68,15,58,68,199,16 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,65,15,56,0,218 + movups 32(%rsi),%xmm7 +.byte 102,68,15,58,68,218,0 + xorps %xmm4,%xmm8 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + pxor %xmm3,%xmm4 + pxor %xmm1,%xmm8 + movdqa %xmm8,%xmm9 + pslldq $8,%xmm8 +.byte 102,68,15,58,68,234,17 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa L$7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 +.byte 102,76,15,110,200 + + pand %xmm0,%xmm8 +.byte 102,69,15,56,0,200 +.byte 102,68,15,58,68,231,0 + pxor %xmm0,%xmm9 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 +.byte 102,15,58,68,222,0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 +.byte 102,15,58,68,238,17 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 +.byte 102,69,15,56,0,218 +.byte 102,15,58,68,231,16 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 +.byte 102,69,15,56,0,194 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,68,15,58,68,231,0 + xorps %xmm13,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jnc L$mod4_loop + +L$tail4x: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm4 +.byte 102,65,15,58,68,207,17 + xorps %xmm3,%xmm0 +.byte 102,68,15,58,68,199,16 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 - movdqu (%rdx),%xmm3 - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 - - movdqa %xmm3,%xmm4 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%rcx + jz L$done + movdqu 32(%rsi),%xmm7 + subq $16,%rcx + jz L$odd_tail +L$skip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 +.byte 102,69,15,56,0,194 +.byte 102,65,15,56,0,218 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,68,218,0 +.byte 102,15,58,68,234,17 +.byte 102,15,58,68,231,0 leaq 32(%rdx),%rdx + nop subq $32,%rcx jbe L$even_tail + nop + jmp L$mod_loop +.p2align 5 L$mod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 
+.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm5 + pxor %xmm0,%xmm8 +.byte 102,65,15,56,0,234 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm5,%xmm1 + pxor %xmm8,%xmm4 +.byte 102,65,15,56,0,218 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 pslldq $8,%xmm4 - pxor %xmm3,%xmm1 + pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 - movdqu (%rdx),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 - - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 psllq $5,%xmm0 - pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 + pxor %xmm0,%xmm8 +.byte 102,15,58,68,218,0 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm8 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 - -.byte 102,15,58,68,250,17 - movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + +.byte 102,15,58,68,234,17 + movdqa %xmm0,%xmm9 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - -.byte 102,69,15,58,68,202,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 +.byte 102,15,58,68,231,0 + pxor %xmm1,%xmm0 +.byte 0x66,0x90 - leaq 32(%rdx),%rdx subq $32,%rcx ja L$mod_loop L$even_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 +.byte 102,15,58,68,198,0 +.byte 102,15,58,68,206,17 +.byte 102,15,58,68,231,16 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 pslldq $8,%xmm4 - pxor %xmm3,%xmm1 + pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz L$done L$odd_tail: - movdqu (%rdx),%xmm3 -.byte 102,15,56,0,221 - pxor %xmm3,%xmm0 + movdqu (%rdx),%xmm8 +.byte 102,69,15,56,0,194 + pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -987,38 +1259,60 @@ L$odd_tail: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 
pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 L$done: -.byte 102,15,56,0,197 +.byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 -L$SEH_end_gcm_ghash_clmul: + +.globl _gcm_init_avx + +.p2align 5 +_gcm_init_avx: + jmp L$_init_clmul + +.globl _gcm_gmult_avx + +.p2align 5 +_gcm_gmult_avx: + jmp L$_gmult_clmul + +.globl _gcm_ghash_avx + +.p2align 5 +_gcm_ghash_avx: + jmp L$_ghash_clmul .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 L$0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$7_mask: +.long 7,0,7,0 +L$7_mask_poly: +.long 7,0,450,0 .p2align 6 L$rem_4bit: diff --git a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s index d7f1e40b18..9091bd802e 100644 --- a/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/macosx/sha1-ssse3-x86_64.s @@ -46,23 +46,25 @@ _sha1_block_data_order: movl __gnutls_x86_cpuid_s+0(%rip),%r9d movl __gnutls_x86_cpuid_s+4(%rip),%r8d + movl __gnutls_x86_cpuid_s+8(%rip),%r10d testl $512,%r8d jz L$ialu jmp _ssse3_shortcut .p2align 4 L$ialu: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 - movq %rsp,%r11 + pushq %r14 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 - movq %r11,64(%rsp) + movq %rax,64(%rsp) L$prologue: movl 0(%r8),%esi @@ -76,1230 +78,1168 @@ L$prologue: L$loop: movl 0(%r9),%edx bswapl %edx - movl %edx,0(%rsp) - movl %r11d,%eax movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %ebp + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,4(%rsp) + leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax - movl 8(%r9),%edx + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) movl %r13d,%ecx - xorl %r11d,%eax - bswapl %edx + bswapl %r14d + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,8(%rsp) + leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 12(%r9),%ebp + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %ebp + bswapl %edx + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,12(%rsp) + leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 16(%r9),%edx + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %ebp + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,16(%rsp) + leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 20(%r9),%ebp + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %r14d + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,20(%rsp) + leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 
%r11d,%eax movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) movl %esi,%ecx - xorl %r12d,%eax bswapl %edx + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rbp,%r13,1),%r13d andl %edi,%eax - movl %edx,24(%rsp) + leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %ebp + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r12,1),%r12d andl %esi,%eax - movl %ebp,28(%rsp) + leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax - movl 32(%r9),%edx + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) movl %r12d,%ecx - xorl %edi,%eax - bswapl %edx + bswapl %r14d + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r11,1),%r11d andl %r13d,%eax - movl %edx,32(%rsp) + leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 36(%r9),%ebp + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %ebp + bswapl %edx + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rdi,1),%edi andl %r12d,%eax - movl %ebp,36(%rsp) + leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 40(%r9),%edx + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %edx + bswapl %ebp + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rsi,1),%esi andl %r11d,%eax - movl %edx,40(%rsp) + leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl %r11d,%eax - movl 44(%r9),%ebp + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) movl %esi,%ecx - xorl %r12d,%eax - bswapl %ebp + bswapl %r14d + xorl %r11d,%eax roll $5,%ecx - leal 1518500249(%rdx,%r13,1),%r13d andl %edi,%eax - movl %ebp,44(%rsp) + leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d - movl %edi,%eax movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) movl %r13d,%ecx - xorl %r11d,%eax bswapl %edx + xorl %edi,%eax roll $5,%ecx - leal 1518500249(%rbp,%r12,1),%r12d andl %esi,%eax - movl %edx,48(%rsp) + leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d - movl %esi,%eax movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) movl %r12d,%ecx - xorl %edi,%eax bswapl %ebp + xorl %esi,%eax roll $5,%ecx - leal 1518500249(%rdx,%r11,1),%r11d andl %r13d,%eax - movl %ebp,52(%rsp) + leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d - movl %r13d,%eax - movl 56(%r9),%edx + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) movl %r11d,%ecx - xorl %esi,%eax - bswapl %edx + bswapl %r14d + xorl %r13d,%eax roll $5,%ecx - leal 1518500249(%rbp,%rdi,1),%edi andl %r12d,%eax - movl %edx,56(%rsp) + leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi - movl %r12d,%eax - movl 60(%r9),%ebp + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %edi,%ecx - xorl %r13d,%eax - bswapl %ebp + bswapl %edx + xorl %r12d,%eax roll $5,%ecx - leal 1518500249(%rdx,%rsi,1),%esi andl %r11d,%eax - movl %ebp,60(%rsp) + leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi - movl 0(%rsp),%edx - movl %r11d,%eax + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %esi,%ecx 
- xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl 8(%rsp),%ebp + xorl %r11d,%eax roll $5,%ecx - xorl 32(%rsp),%edx + xorl 32(%rsp),%ebp andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - xorl 52(%rsp),%edx + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi xorl %r12d,%eax - roll $1,%edx addl %ecx,%r13d - roll $30,%edi - movl %edx,0(%rsp) + roll $1,%ebp addl %eax,%r13d - movl 4(%rsp),%ebp - movl %edi,%eax + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %r13d,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax + xorl 12(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx - xorl 36(%rsp),%ebp + xorl 36(%rsp),%r14d andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - xorl 56(%rsp),%ebp + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi xorl %r11d,%eax - roll $1,%ebp addl %ecx,%r12d - roll $30,%esi - movl %ebp,4(%rsp) + roll $1,%r14d addl %eax,%r12d - movl 8(%rsp),%edx - movl %esi,%eax + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - xorl 60(%rsp),%edx + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d xorl %edi,%eax - roll $1,%edx addl %ecx,%r11d - roll $30,%r13d - movl %edx,8(%rsp) + roll $1,%edx addl %eax,%r11d - movl 12(%rsp),%ebp - movl %r13d,%eax + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi - xorl 0(%rsp),%ebp + roll $30,%r12d xorl %esi,%eax - roll $1,%ebp addl %ecx,%edi - roll $30,%r12d - movl %ebp,12(%rsp) + roll $1,%ebp addl %eax,%edi - movl 16(%rsp),%edx - movl %r12d,%eax + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) movl %edi,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx - xorl 48(%rsp),%edx + xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi - xorl 4(%rsp),%edx + roll $30,%r11d xorl %r13d,%eax - roll $1,%edx addl %ecx,%esi - roll $30,%r11d - movl %edx,16(%rsp) + roll $1,%r14d addl %eax,%esi - movl 20(%rsp),%ebp - movl %r11d,%eax + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) movl %esi,%ecx - xorl 28(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 8(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %edi,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) movl %r13d,%ecx - xorl 32(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r12,1),%r12d - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 12(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %esi,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) movl %r12d,%ecx - xorl 36(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 16(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 
32(%rsp),%edx - movl %r13d,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi xorl 0(%rsp),%edx - xorl %esi,%eax + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 20(%rsp),%edx roll $30,%r12d addl %eax,%edi roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r12d,%eax movl %edi,%ecx xorl 44(%rsp),%ebp - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi xorl 4(%rsp),%ebp - xorl %r13d,%eax + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 24(%rsp),%ebp roll $30,%r11d addl %eax,%esi roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r11d,%eax movl %esi,%ecx - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl 48(%rsp),%r14d + xorl %r12d,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d - xorl 8(%rsp),%edx - xorl %r12d,%eax + xorl %r11d,%eax addl %ecx,%r13d - xorl 28(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %edi,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) movl %r13d,%ecx - xorl 52(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 32(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %esi,%eax + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) movl %r12d,%ecx - xorl 56(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%r11,1),%r11d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 36(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %r13d,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) movl %r11d,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 40(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %r12d,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx - xorl %r11d,%eax + xorl %r13d,%eax roll $5,%ecx - leal 1859775393(%rbp,%rsi,1),%esi xorl 24(%rsp),%edx - xorl %r13d,%eax + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 44(%rsp),%edx roll $30,%r11d addl %eax,%esi roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 4(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r13,1),%r13d xorl 28(%rsp),%ebp - xorl %r12d,%eax + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 48(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 8(%rsp),%edx - xorl %esi,%eax + 
xorl 8(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d - xorl 32(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 52(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) movl %r12d,%ecx - xorl 12(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%r11,1),%r11d - xorl 36(%rsp),%ebp + xorl 12(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 56(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rbp,%rdi,1),%edi - xorl 40(%rsp),%edx + xorl 16(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 60(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rsi,1),%esi - xorl 44(%rsp),%ebp + xorl 20(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 0(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal 1859775393(%rbp,%r13,1),%r13d xorl 48(%rsp),%edx - xorl %r12d,%eax + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 4(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal 1859775393(%rdx,%r12,1),%r12d xorl 52(%rsp),%ebp - xorl %r11d,%eax + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 8(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 32(%rsp),%edx - xorl %r13d,%eax + xorl 32(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d - xorl 56(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 12(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) movl %r11d,%ecx - xorl 36(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal 1859775393(%rdx,%rdi,1),%edi - xorl 60(%rsp),%ebp + xorl 36(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 16(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal 
1859775393(%rbp,%rsi,1),%esi - xorl 0(%rsp),%edx + xorl 40(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 20(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 44(%rsp),%ebp - andl %r12d,%eax + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,36(%rsp) addl %ecx,%r13d - movl 40(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx xorl 48(%rsp),%edx - andl %r11d,%eax + andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r12d - andl %esi,%ebx roll $1,%edx - addl %ebx,%r12d + andl %esi,%ebx + addl %ecx,%r12d roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax movl %edx,40(%rsp) - addl %ecx,%r12d - movl 44(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx + movl %edi,%ebx xorl 52(%rsp),%ebp - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp - xorl %edi,%ebx leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%r11d - andl %r13d,%ebx roll $1,%ebp - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax movl %ebp,44(%rsp) - addl %ecx,%r11d - movl 48(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %esi,%eax + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%ebx + xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%edx - addl %ebx,%edi - roll $30,%r12d - movl %edx,48(%rsp) addl %ecx,%edi - movl 52(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx - xorl 60(%rsp),%ebp - andl %r13d,%eax + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax movl %edi,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%ebx - leal -1894007588(%rdx,%rsi,1),%esi + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 40(%rsp),%ebp addl %eax,%esi + roll $1,%edx andl %r11d,%ebx - roll $1,%ebp - addl %ebx,%esi - roll $30,%r11d - movl %ebp,52(%rsp) addl %ecx,%esi - movl 56(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 0(%rsp),%edx - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%ebx - leal -1894007588(%rbp,%r13,1),%r13d + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 44(%rsp),%edx addl %eax,%r13d + roll $1,%ebp andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,56(%rsp) addl %ecx,%r13d - movl 60(%rsp),%ebp - 
movl %edi,%eax - movl %edi,%ebx - xorl 4(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 48(%rsp),%ebp addl %eax,%r12d + roll $1,%r14d andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,60(%rsp) addl %ecx,%r12d - movl 0(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx xorl 8(%rsp),%edx - andl %edi,%eax + andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 52(%rsp),%edx addl %eax,%r11d - andl %r13d,%ebx roll $1,%edx - addl %ebx,%r11d + andl %r13d,%ebx + addl %ecx,%r11d roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax movl %edx,0(%rsp) - addl %ecx,%r11d - movl 4(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx + movl %esi,%ebx xorl 12(%rsp),%ebp - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp - xorl %esi,%ebx leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 56(%rsp),%ebp addl %eax,%edi - andl %r12d,%ebx roll $1,%ebp - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax movl %ebp,4(%rsp) - addl %ecx,%edi - movl 8(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx - xorl 16(%rsp),%edx - andl %r13d,%eax + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax movl %edi,%ecx - xorl 40(%rsp),%edx - xorl %r13d,%ebx + xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 60(%rsp),%edx addl %eax,%esi + roll $1,%r14d andl %r11d,%ebx - roll $1,%edx - addl %ebx,%esi - roll $30,%r11d - movl %edx,8(%rsp) addl %ecx,%esi - movl 12(%rsp),%ebp - movl %r11d,%eax - movl %r11d,%ebx - xorl 20(%rsp),%ebp - andl %r12d,%eax + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax movl %esi,%ecx - xorl 44(%rsp),%ebp - xorl %r12d,%ebx - leal -1894007588(%rdx,%r13,1),%r13d + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 0(%rsp),%ebp addl %eax,%r13d + roll $1,%edx andl %edi,%ebx - roll $1,%ebp - addl %ebx,%r13d - roll $30,%edi - movl %ebp,12(%rsp) addl %ecx,%r13d - movl 16(%rsp),%edx - movl %edi,%eax - movl %edi,%ebx - xorl 24(%rsp),%edx - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax movl %r13d,%ecx - xorl 48(%rsp),%edx - xorl %r11d,%ebx - leal -1894007588(%rbp,%r12,1),%r12d + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 4(%rsp),%edx addl %eax,%r12d + roll $1,%ebp andl %esi,%ebx - roll $1,%edx - addl %ebx,%r12d - roll $30,%esi - movl %edx,16(%rsp) addl %ecx,%r12d - movl 20(%rsp),%ebp - movl %esi,%eax - movl %esi,%ebx - xorl 28(%rsp),%ebp - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax movl %r12d,%ecx - xorl 52(%rsp),%ebp - xorl %edi,%ebx - leal 
-1894007588(%rdx,%r11,1),%r11d + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 8(%rsp),%ebp addl %eax,%r11d + roll $1,%r14d andl %r13d,%ebx - roll $1,%ebp - addl %ebx,%r11d - roll $30,%r13d - movl %ebp,20(%rsp) addl %ecx,%r11d - movl 24(%rsp),%edx - movl %r13d,%eax - movl %r13d,%ebx + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx xorl 32(%rsp),%edx - andl %esi,%eax + andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx - xorl %esi,%ebx - leal -1894007588(%rbp,%rdi,1),%edi + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 12(%rsp),%edx addl %eax,%edi - andl %r12d,%ebx roll $1,%edx - addl %ebx,%edi + andl %r12d,%ebx + addl %ecx,%edi roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax movl %edx,24(%rsp) - addl %ecx,%edi - movl 28(%rsp),%ebp - movl %r12d,%eax - movl %r12d,%ebx + movl %r13d,%ebx xorl 36(%rsp),%ebp - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp - xorl %r13d,%ebx leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 16(%rsp),%ebp addl %eax,%esi - andl %r11d,%ebx roll $1,%ebp - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax movl %ebp,28(%rsp) - addl %ecx,%esi - movl 32(%rsp),%edx - movl %r11d,%eax - movl %r11d,%ebx - xorl 40(%rsp),%edx - andl %r12d,%eax + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax movl %esi,%ecx - xorl 0(%rsp),%edx - xorl %r12d,%ebx + xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx roll $5,%ecx - xorl 20(%rsp),%edx addl %eax,%r13d + roll $1,%r14d andl %edi,%ebx - roll $1,%edx - addl %ebx,%r13d - roll $30,%edi - movl %edx,32(%rsp) addl %ecx,%r13d - movl 36(%rsp),%ebp - movl %edi,%eax - movl %edi,%ebx - xorl 44(%rsp),%ebp - andl %r11d,%eax + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax movl %r13d,%ecx - xorl 4(%rsp),%ebp - xorl %r11d,%ebx - leal -1894007588(%rdx,%r12,1),%r12d + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx roll $5,%ecx - xorl 24(%rsp),%ebp addl %eax,%r12d + roll $1,%edx andl %esi,%ebx - roll $1,%ebp - addl %ebx,%r12d - roll $30,%esi - movl %ebp,36(%rsp) addl %ecx,%r12d - movl 40(%rsp),%edx - movl %esi,%eax - movl %esi,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax movl %r12d,%ecx - xorl 8(%rsp),%edx - xorl %edi,%ebx - leal -1894007588(%rbp,%r11,1),%r11d + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx roll $5,%ecx - xorl 28(%rsp),%edx addl %eax,%r11d + roll $1,%ebp andl %r13d,%ebx - roll $1,%edx - addl %ebx,%r11d - roll $30,%r13d - movl %edx,40(%rsp) addl %ecx,%r11d - movl 44(%rsp),%ebp - movl %r13d,%eax - movl %r13d,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax movl %r11d,%ecx - xorl 12(%rsp),%ebp - xorl %esi,%ebx - leal -1894007588(%rdx,%rdi,1),%edi + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx roll $5,%ecx - xorl 32(%rsp),%ebp addl %eax,%edi + roll $1,%r14d andl %r12d,%ebx - roll $1,%ebp - addl %ebx,%edi - roll $30,%r12d - movl %ebp,44(%rsp) addl %ecx,%edi - movl 
48(%rsp),%edx - movl %r12d,%eax - movl %r12d,%ebx + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx xorl 56(%rsp),%edx - andl %r13d,%eax + andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx - xorl %r13d,%ebx - leal -1894007588(%rbp,%rsi,1),%esi + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx roll $5,%ecx - xorl 36(%rsp),%edx addl %eax,%esi - andl %r11d,%ebx roll $1,%edx - addl %ebx,%esi + andl %r11d,%ebx + addl %ecx,%esi roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax movl %edx,48(%rsp) - addl %ecx,%esi - movl 52(%rsp),%ebp - movl %r11d,%eax movl %esi,%ecx xorl 60(%rsp),%ebp - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d xorl 20(%rsp),%ebp - xorl %r12d,%eax + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 40(%rsp),%ebp roll $30,%edi addl %eax,%r13d roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax movl %ebp,52(%rsp) - movl 56(%rsp),%edx - movl %edi,%eax movl %r13d,%ecx - xorl 0(%rsp),%edx - xorl %esi,%eax + xorl 0(%rsp),%r14d + xorl %r11d,%eax roll $5,%ecx + xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d - xorl 24(%rsp),%edx - xorl %r11d,%eax + xorl %edi,%eax addl %ecx,%r12d - xorl 44(%rsp),%edx roll $30,%esi addl %eax,%r12d - roll $1,%edx - movl %edx,56(%rsp) - movl 60(%rsp),%ebp - movl %esi,%eax + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) movl %r12d,%ecx - xorl 4(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d - xorl 28(%rsp),%ebp + xorl 4(%rsp),%edx xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 48(%rsp),%ebp roll $30,%r13d addl %eax,%r11d - roll $1,%ebp - movl %ebp,60(%rsp) - movl 0(%rsp),%edx - movl %r13d,%eax + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) movl %r11d,%ecx - xorl 8(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rdi,1),%edi - xorl 32(%rsp),%edx + xorl 8(%rsp),%ebp xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 52(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,0(%rsp) - movl 4(%rsp),%ebp - movl %r12d,%eax + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) movl %edi,%ecx - xorl 12(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 36(%rsp),%ebp + xorl 12(%rsp),%r14d xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 56(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,4(%rsp) - movl 8(%rsp),%edx - movl %r11d,%eax + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx - xorl %edi,%eax + xorl %r12d,%eax roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d xorl 40(%rsp),%edx - xorl %r12d,%eax + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 60(%rsp),%edx roll $30,%edi addl %eax,%r13d roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax movl %edx,8(%rsp) - movl 12(%rsp),%ebp - movl %edi,%eax movl %r13d,%ecx xorl 20(%rsp),%ebp - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d xorl 44(%rsp),%ebp - xorl %r11d,%eax + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 0(%rsp),%ebp roll $30,%esi addl %eax,%r12d roll $1,%ebp + xorl 
16(%rsp),%r14d + movl %r13d,%eax movl %ebp,12(%rsp) - movl 16(%rsp),%edx - movl %esi,%eax movl %r12d,%ecx - xorl 24(%rsp),%edx - xorl %r13d,%eax + xorl 24(%rsp),%r14d + xorl %edi,%eax roll $5,%ecx + xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d - xorl 48(%rsp),%edx - xorl %edi,%eax + xorl %esi,%eax addl %ecx,%r11d - xorl 4(%rsp),%edx roll $30,%r13d addl %eax,%r11d - roll $1,%edx - movl %edx,16(%rsp) - movl 20(%rsp),%ebp - movl %r13d,%eax + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) movl %r11d,%ecx - xorl 28(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi - xorl 52(%rsp),%ebp + xorl 28(%rsp),%edx xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 8(%rsp),%ebp roll $30,%r12d addl %eax,%edi - roll $1,%ebp - movl %ebp,20(%rsp) - movl 24(%rsp),%edx - movl %r12d,%eax + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) movl %edi,%ecx - xorl 32(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rbp,%rsi,1),%esi - xorl 56(%rsp),%edx + xorl 32(%rsp),%ebp xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 12(%rsp),%edx roll $30,%r11d addl %eax,%esi - roll $1,%edx - movl %edx,24(%rsp) - movl 28(%rsp),%ebp - movl %r11d,%eax + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) movl %esi,%ecx - xorl 36(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r13,1),%r13d - xorl 60(%rsp),%ebp + xorl 36(%rsp),%r14d xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 16(%rsp),%ebp roll $30,%edi addl %eax,%r13d - roll $1,%ebp - movl %ebp,28(%rsp) - movl 32(%rsp),%edx - movl %edi,%eax + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx - xorl %esi,%eax + xorl %r11d,%eax roll $5,%ecx - leal -899497514(%rbp,%r12,1),%r12d xorl 0(%rsp),%edx - xorl %r11d,%eax + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 20(%rsp),%edx roll $30,%esi addl %eax,%r12d roll $1,%edx - movl %edx,32(%rsp) - movl 36(%rsp),%ebp - movl %esi,%eax + xorl 36(%rsp),%ebp + movl %r13d,%eax + movl %r12d,%ecx xorl 44(%rsp),%ebp - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rdx,%r11,1),%r11d xorl 4(%rsp),%ebp - xorl %edi,%eax + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 24(%rsp),%ebp roll $30,%r13d addl %eax,%r11d roll $1,%ebp - movl %ebp,36(%rsp) - movl 40(%rsp),%edx - movl %r13d,%eax + xorl 40(%rsp),%r14d + movl %r12d,%eax + movl %r11d,%ecx - xorl 48(%rsp),%edx - xorl %r12d,%eax + xorl 48(%rsp),%r14d + xorl %esi,%eax roll $5,%ecx + xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi - xorl 8(%rsp),%edx - xorl %esi,%eax + xorl %r13d,%eax addl %ecx,%edi - xorl 28(%rsp),%edx roll $30,%r12d addl %eax,%edi - roll $1,%edx - movl %edx,40(%rsp) - movl 44(%rsp),%ebp - movl %r12d,%eax + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + movl %edi,%ecx - xorl 52(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - leal -899497514(%rdx,%rsi,1),%esi - xorl 12(%rsp),%ebp + xorl 52(%rsp),%edx xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax addl %ecx,%esi - xorl 32(%rsp),%ebp roll $30,%r11d addl %eax,%esi - roll $1,%ebp - movl %ebp,44(%rsp) - movl 48(%rsp),%edx - movl %r11d,%eax + roll $1,%edx + xorl 
48(%rsp),%ebp + movl %edi,%eax + movl %esi,%ecx - xorl 56(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - leal -899497514(%rbp,%r13,1),%r13d - xorl 16(%rsp),%edx + xorl 56(%rsp),%ebp xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax addl %ecx,%r13d - xorl 36(%rsp),%edx roll $30,%edi addl %eax,%r13d - roll $1,%edx - movl %edx,48(%rsp) - movl 52(%rsp),%ebp - movl %edi,%eax + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + movl %r13d,%ecx - xorl 60(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - leal -899497514(%rdx,%r12,1),%r12d - xorl 20(%rsp),%ebp + xorl 60(%rsp),%r14d xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax addl %ecx,%r12d - xorl 40(%rsp),%ebp roll $30,%esi addl %eax,%r12d - roll $1,%ebp - movl 56(%rsp),%edx - movl %esi,%eax + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + movl %r12d,%ecx xorl 0(%rsp),%edx - xorl %r13d,%eax + xorl %edi,%eax roll $5,%ecx - leal -899497514(%rbp,%r11,1),%r11d xorl 24(%rsp),%edx - xorl %edi,%eax + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax addl %ecx,%r11d - xorl 44(%rsp),%edx roll $30,%r13d addl %eax,%r11d roll $1,%edx - movl 60(%rsp),%ebp - movl %r13d,%eax + xorl 60(%rsp),%ebp + movl %r12d,%eax + movl %r11d,%ecx xorl 4(%rsp),%ebp - xorl %r12d,%eax + xorl %esi,%eax roll $5,%ecx - leal -899497514(%rdx,%rdi,1),%edi xorl 28(%rsp),%ebp - xorl %esi,%eax + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax addl %ecx,%edi - xorl 48(%rsp),%ebp roll $30,%r12d addl %eax,%edi roll $1,%ebp - movl %r12d,%eax + movl %r11d,%eax movl %edi,%ecx - xorl %r11d,%eax + xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx - xorl %r13d,%eax + xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi @@ -1319,11 +1259,12 @@ L$loop: jnz L$loop movq 64(%rsp),%rsi - movq (%rsi),%r13 - movq 8(%rsi),%r12 - movq 16(%rsi),%rbp - movq 24(%rsi),%rbx - leaq 32(%rsi),%rsp + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue: .byte 0xf3,0xc3 @@ -1331,17 +1272,22 @@ L$epilogue: .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 leaq -64(%rsp),%rsp + movq %rax,%r14 + andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX(%rip),%r11 + leaq K_XX_XX+64(%rip),%r11 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1349,19 +1295,22 @@ _ssse3_shortcut: movl 12(%r8),%edx movl %ebx,%esi movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 - addq $64,%r9 .byte 102,15,56,0,206 .byte 102,15,56,0,214 -.byte 102,15,56,0,222 + addq $64,%r9 paddd %xmm9,%xmm0 +.byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) @@ -1373,904 +1322,882 @@ _ssse3_shortcut: jmp L$oop_ssse3 .p2align 4 L$oop_ssse3: - movdqa %xmm1,%xmm4 - addl 0(%rsp),%ebp - xorl %edx,%ecx + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi movdqa %xmm3,%xmm8 -.byte 102,15,58,15,224,8 + paddd %xmm3,%xmm9 movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx roll $5,%eax - paddd %xmm3,%xmm9 - andl %ecx,%esi - xorl %edx,%ecx + addl %esi,%ebp psrldq $4,%xmm8 - xorl %edx,%esi - addl %eax,%ebp + andl %ebx,%edi + xorl %ecx,%ebx pxor %xmm0,%xmm4 - rorl $2,%ebx - addl 
%esi,%ebp + addl %eax,%ebp + rorl $7,%eax pxor %xmm2,%xmm8 - addl 4(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 4(%rsp),%edx pxor %xmm8,%xmm4 - andl %ebx,%edi - xorl %ecx,%ebx + xorl %ebx,%eax + roll $5,%ebp movdqa %xmm9,48(%rsp) - xorl %ecx,%edi - addl %ebp,%edx - movdqa %xmm4,%xmm10 - movdqa %xmm4,%xmm8 - rorl $7,%eax addl %edi,%edx - addl 8(%rsp),%ecx + andl %eax,%esi + movdqa %xmm4,%xmm10 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi - roll $5,%edx - andl %eax,%esi - xorl %ebx,%eax + addl 8(%rsp),%ecx psrld $31,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx - movdqa %xmm10,%xmm9 - rorl $7,%ebp + xorl %eax,%ebp + roll $5,%edx addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx por %xmm8,%xmm4 - addl 12(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 - andl %ebp,%edi - xorl %eax,%ebp - movdqa 0(%r11),%xmm10 - xorl %eax,%edi - addl %ecx,%ebx - pxor %xmm9,%xmm4 - rorl $7,%edx + xorl %ebp,%edx + movdqa -64(%r11),%xmm10 + roll $5,%ecx addl %edi,%ebx - movdqa %xmm2,%xmm5 - addl 16(%rsp),%eax + andl %edx,%esi + pxor %xmm9,%xmm4 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi movdqa %xmm4,%xmm9 -.byte 102,15,58,15,233,8 + paddd %xmm4,%xmm10 movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx roll $5,%ebx - paddd %xmm4,%xmm10 - andl %edx,%esi - xorl %ebp,%edx + addl %esi,%eax psrldq $4,%xmm9 - xorl %ebp,%esi - addl %ebx,%eax + andl %ecx,%edi + xorl %edx,%ecx pxor %xmm1,%xmm5 - rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax + rorl $7,%ebx pxor %xmm3,%xmm9 - addl 20(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 20(%rsp),%ebp pxor %xmm9,%xmm5 - andl %ecx,%edi - xorl %edx,%ecx + xorl %ecx,%ebx + roll $5,%eax movdqa %xmm10,0(%rsp) - xorl %edx,%edi - addl %eax,%ebp - movdqa %xmm5,%xmm8 - movdqa %xmm5,%xmm9 - rorl $7,%ebx addl %edi,%ebp - addl 24(%rsp),%edx + andl %ebx,%esi + movdqa %xmm5,%xmm8 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi - roll $5,%ebp - andl %ebx,%esi - xorl %ecx,%ebx + addl 24(%rsp),%edx psrld $31,%xmm9 - xorl %ecx,%esi - addl %ebp,%edx - movdqa %xmm8,%xmm10 - rorl $7,%eax + xorl %ebx,%eax + roll $5,%ebp addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax psrld $30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp por %xmm9,%xmm5 - addl 28(%rsp),%ecx - xorl %ebx,%eax + xorl %ebx,%edi movl %edx,%esi - roll $5,%edx + addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 - andl %eax,%edi - xorl %ebx,%eax - movdqa 16(%r11),%xmm8 - xorl %ebx,%edi - addl %edx,%ecx - pxor %xmm10,%xmm5 - rorl $7,%ebp + xorl %eax,%ebp + movdqa -32(%r11),%xmm8 + roll $5,%edx addl %edi,%ecx - movdqa %xmm3,%xmm6 - addl 32(%rsp),%ebx + andl %ebp,%esi + pxor %xmm10,%xmm5 xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi movdqa %xmm5,%xmm10 -.byte 102,15,58,15,242,8 + paddd %xmm5,%xmm8 movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx roll $5,%ecx - paddd %xmm5,%xmm8 - andl %ebp,%esi - xorl %eax,%ebp + addl %esi,%ebx psrldq $4,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx + andl %edx,%edi + xorl %ebp,%edx pxor %xmm2,%xmm6 - rorl $7,%edx - addl %esi,%ebx + addl %ecx,%ebx + rorl $7,%ecx pxor 
%xmm4,%xmm10 - addl 36(%rsp),%eax - xorl %ebp,%edx + xorl %ebp,%edi movl %ebx,%esi - roll $5,%ebx + addl 36(%rsp),%eax pxor %xmm10,%xmm6 - andl %edx,%edi - xorl %ebp,%edx + xorl %edx,%ecx + roll $5,%ebx movdqa %xmm8,16(%rsp) - xorl %ebp,%edi - addl %ebx,%eax - movdqa %xmm6,%xmm9 - movdqa %xmm6,%xmm10 - rorl $7,%ecx addl %edi,%eax - addl 40(%rsp),%ebp + andl %ecx,%esi + movdqa %xmm6,%xmm9 xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi - roll $5,%eax - andl %ecx,%esi - xorl %edx,%ecx + addl 40(%rsp),%ebp psrld $31,%xmm10 - xorl %edx,%esi - addl %eax,%ebp - movdqa %xmm9,%xmm8 - rorl $7,%ebx + xorl %ecx,%ebx + roll $5,%eax addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax por %xmm10,%xmm6 - addl 44(%rsp),%edx - xorl %ecx,%ebx + xorl %ecx,%edi movl %ebp,%esi - roll $5,%ebp + addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 - andl %ebx,%edi - xorl %ecx,%ebx - movdqa 16(%r11),%xmm9 - xorl %ecx,%edi - addl %ebp,%edx - pxor %xmm8,%xmm6 - rorl $7,%eax + xorl %ebx,%eax + movdqa -32(%r11),%xmm9 + roll $5,%ebp addl %edi,%edx - movdqa %xmm4,%xmm7 - addl 48(%rsp),%ecx + andl %eax,%esi + pxor %xmm8,%xmm6 xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi movdqa %xmm6,%xmm8 -.byte 102,15,58,15,251,8 + paddd %xmm6,%xmm9 movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp roll $5,%edx - paddd %xmm6,%xmm9 - andl %eax,%esi - xorl %ebx,%eax + addl %esi,%ecx psrldq $4,%xmm8 - xorl %ebx,%esi - addl %edx,%ecx + andl %ebp,%edi + xorl %eax,%ebp pxor %xmm3,%xmm7 - rorl $7,%ebp - addl %esi,%ecx + addl %edx,%ecx + rorl $7,%edx pxor %xmm5,%xmm8 - addl 52(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%edi movl %ecx,%esi - roll $5,%ecx + addl 52(%rsp),%ebx pxor %xmm8,%xmm7 - andl %ebp,%edi - xorl %eax,%ebp + xorl %ebp,%edx + roll $5,%ecx movdqa %xmm9,32(%rsp) - xorl %eax,%edi - addl %ecx,%ebx - movdqa %xmm7,%xmm10 - movdqa %xmm7,%xmm8 - rorl $7,%edx addl %edi,%ebx - addl 56(%rsp),%eax + andl %edx,%esi + movdqa %xmm7,%xmm10 xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi - roll $5,%ebx - andl %edx,%esi - xorl %ebp,%edx + addl 56(%rsp),%eax psrld $31,%xmm8 - xorl %ebp,%esi - addl %ebx,%eax - movdqa %xmm10,%xmm9 - rorl $7,%ecx + xorl %edx,%ecx + roll $5,%ebx addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx por %xmm8,%xmm7 - addl 60(%rsp),%ebp - xorl %edx,%ecx + xorl %edx,%edi movl %eax,%esi - roll $5,%eax + addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 - andl %ecx,%edi - xorl %edx,%ecx - movdqa 16(%r11),%xmm10 - xorl %edx,%edi - addl %eax,%ebp - pxor %xmm9,%xmm7 - rorl $7,%ebx + xorl %ecx,%ebx + movdqa -32(%r11),%xmm10 + roll $5,%eax addl %edi,%ebp - movdqa %xmm7,%xmm9 - addl 0(%rsp),%edx - pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,206,8 + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax roll $5,%ebp pxor %xmm1,%xmm0 - andl %ebx,%esi - xorl %ecx,%ebx + addl %esi,%edx + andl %eax,%edi movdqa %xmm10,%xmm8 + xorl %ebx,%eax paddd %xmm7,%xmm10 - xorl %ecx,%esi addl %ebp,%edx pxor %xmm9,%xmm0 - rorl $7,%eax - addl %esi,%edx + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi addl 4(%rsp),%ecx 
- xorl %ebx,%eax movdqa %xmm0,%xmm9 - movdqa %xmm10,48(%rsp) - movl %edx,%esi + xorl %eax,%ebp roll $5,%edx - andl %eax,%edi - xorl %ebx,%eax + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp pslld $2,%xmm0 - xorl %ebx,%edi addl %edx,%ecx + rorl $7,%edx psrld $30,%xmm9 - rorl $7,%ebp - addl %edi,%ecx - addl 8(%rsp),%ebx - xorl %eax,%ebp + xorl %eax,%esi movl %ecx,%edi - roll $5,%ecx + addl 8(%rsp),%ebx por %xmm9,%xmm0 - andl %ebp,%esi - xorl %eax,%ebp - movdqa %xmm0,%xmm10 - xorl %eax,%esi - addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - addl 12(%rsp),%eax xorl %ebp,%edx - movl %ebx,%esi - roll $5,%ebx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax xorl %ebp,%edi - addl %ebx,%eax - rorl $7,%ecx + movl %ebx,%esi + roll $5,%ebx addl %edi,%eax - addl 16(%rsp),%ebp - pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,215,8 xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm8,%xmm9 - paddd %xmm0,%xmm8 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm0,%xmm8 + addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm8,0(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 + addl %ebp,%edx addl 24(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm10 + pslld $2,%xmm1 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm10 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm10,%xmm1 + addl %edx,%ecx addl 28(%rsp),%ebx - xorl %eax,%edi - movdqa %xmm1,%xmm8 + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 32(%rsp),%eax - pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,192,8 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 - xorl %edx,%esi - addl %ebx,%eax - movdqa 32(%r11),%xmm10 - paddd %xmm1,%xmm9 - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r11),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi movdqa %xmm9,16(%rsp) - xorl %ecx,%edi - addl %eax,%ebp rorl $7,%ebx - addl %edi,%ebp - pslld $2,%xmm2 + addl %eax,%ebp addl 40(%rsp),%edx - xorl %ecx,%esi - psrld $30,%xmm8 + pslld $2,%xmm2 + xorl %ebx,%esi movl %ebp,%edi + psrld $30,%xmm8 roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax por %xmm8,%xmm2 + addl %ebp,%edx addl 44(%rsp),%ecx - xorl %ebx,%edi - movdqa %xmm2,%xmm9 + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 48(%rsp),%ebx - pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,201,8 xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 - xorl %ebp,%esi - addl %ecx,%ebx + addl %esi,%ebx + xorl %ebp,%edi movdqa %xmm10,%xmm8 - paddd %xmm2,%xmm10 rorl $7,%edx - addl 
%esi,%ebx + paddd %xmm2,%xmm10 + addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax - xorl %ebp,%edi + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi movdqa %xmm10,32(%rsp) - xorl %edx,%edi - addl %ebx,%eax rorl $7,%ecx - addl %edi,%eax - pslld $2,%xmm3 + addl %ebx,%eax addl 56(%rsp),%ebp - xorl %edx,%esi - psrld $30,%xmm9 + pslld $2,%xmm3 + xorl %ecx,%esi movl %eax,%edi + psrld $30,%xmm9 roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx por %xmm9,%xmm3 + addl %eax,%ebp addl 60(%rsp),%edx - xorl %ecx,%edi - movdqa %xmm3,%xmm10 + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 0(%rsp),%ecx - pxor %xmm0,%xmm4 -.byte 102,68,15,58,15,210,8 xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 - xorl %eax,%esi - addl %edx,%ecx + addl %esi,%ecx + xorl %eax,%edi movdqa %xmm8,%xmm9 - paddd %xmm3,%xmm8 rorl $7,%ebp - addl %esi,%ecx + paddd %xmm3,%xmm8 + addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi movdqa %xmm8,48(%rsp) - xorl %ebp,%edi - addl %ecx,%ebx rorl $7,%edx - addl %edi,%ebx - pslld $2,%xmm4 + addl %ecx,%ebx addl 8(%rsp),%eax - xorl %ebp,%esi - psrld $30,%xmm10 + pslld $2,%xmm4 + xorl %edx,%esi movl %ebx,%edi + psrld $30,%xmm10 roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx por %xmm10,%xmm4 + addl %ebx,%eax addl 12(%rsp),%ebp - xorl %edx,%edi - movdqa %xmm4,%xmm8 + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 16(%rsp),%edx - pxor %xmm1,%xmm5 -.byte 102,68,15,58,15,195,8 xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 - xorl %ebx,%esi - addl %ebp,%edx + addl %esi,%edx + xorl %ebx,%edi movdqa %xmm9,%xmm10 - paddd %xmm4,%xmm9 rorl $7,%eax - addl %esi,%edx + paddd %xmm4,%xmm9 + addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx - xorl %ebx,%edi + xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi movdqa %xmm9,0(%rsp) - xorl %eax,%edi - addl %edx,%ecx rorl $7,%ebp - addl %edi,%ecx - pslld $2,%xmm5 + addl %edx,%ecx addl 24(%rsp),%ebx - xorl %eax,%esi - psrld $30,%xmm8 + pslld $2,%xmm5 + xorl %ebp,%esi movl %ecx,%edi + psrld $30,%xmm8 roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx por %xmm8,%xmm5 + addl %ecx,%ebx addl 28(%rsp),%eax - xorl %ebp,%edi - movdqa %xmm5,%xmm9 + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx movl %ebx,%esi - roll $5,%ebx xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx + roll $5,%ebx addl %edi,%eax - movl %ecx,%edi - pxor %xmm2,%xmm6 -.byte 102,68,15,58,15,204,8 + xorl %ecx,%esi xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 addl 32(%rsp),%ebp - andl %edx,%edi - pxor %xmm7,%xmm6 andl %ecx,%esi + xorl %edx,%ecx rorl $7,%ebx - movdqa %xmm10,%xmm8 - paddd %xmm5,%xmm10 - addl %edi,%ebp + punpcklqdq %xmm5,%xmm9 movl %eax,%edi - pxor %xmm9,%xmm6 + xorl %ecx,%esi + pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movdqa %xmm6,%xmm9 - 
movdqa %xmm10,16(%rsp) - movl %ebx,%esi + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp addl 36(%rsp),%edx - andl %ecx,%esi - pslld $2,%xmm6 andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - psrld $30,%xmm9 - addl %esi,%edx + movdqa %xmm6,%xmm9 movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - por %xmm9,%xmm6 - movl %eax,%edi + xorl %eax,%esi + pslld $2,%xmm6 xorl %ebx,%eax - movdqa %xmm6,%xmm10 + addl %ebp,%edx + psrld $30,%xmm9 addl 40(%rsp),%ecx - andl %ebx,%edi andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 rorl $7,%ebp - addl %edi,%ecx movl %edx,%edi + xorl %eax,%esi roll $5,%edx + pshufd $238,%xmm5,%xmm10 addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movl %ebp,%esi + xorl %ebp,%edi xorl %eax,%ebp + addl %edx,%ecx addl 44(%rsp),%ebx - andl %eax,%esi andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - addl %esi,%ebx movl %ecx,%esi + xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp + xorl %edx,%esi + xorl %ebp,%edx addl %ecx,%ebx - movl %edx,%edi pxor %xmm3,%xmm7 -.byte 102,68,15,58,15,213,8 - xorl %ebp,%edx addl 48(%rsp),%eax - andl %ebp,%edi - pxor %xmm0,%xmm7 andl %edx,%esi + xorl %ebp,%edx rorl $7,%ecx - movdqa 48(%r11),%xmm9 - paddd %xmm6,%xmm8 - addl %edi,%eax + punpcklqdq %xmm6,%xmm10 movl %ebx,%edi - pxor %xmm10,%xmm7 + xorl %edx,%esi + pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movdqa %xmm7,%xmm10 - movdqa %xmm8,32(%rsp) - movl %ecx,%esi + movdqa 32(%r11),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax addl 52(%rsp),%ebp - andl %edx,%esi - pslld $2,%xmm7 andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - psrld $30,%xmm10 - addl %esi,%ebp + movdqa %xmm7,%xmm10 movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - por %xmm10,%xmm7 - movl %ebx,%edi + xorl %ebx,%esi + pslld $2,%xmm7 xorl %ecx,%ebx - movdqa %xmm7,%xmm8 + addl %eax,%ebp + psrld $30,%xmm10 addl 56(%rsp),%edx - andl %ecx,%edi andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 rorl $7,%eax - addl %edi,%edx movl %ebp,%edi + xorl %ebx,%esi roll $5,%ebp + pshufd $238,%xmm6,%xmm8 addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movl %eax,%esi + xorl %eax,%edi xorl %ebx,%eax + addl %ebp,%edx addl 60(%rsp),%ecx - andl %ebx,%esi andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - addl %esi,%ecx movl %edx,%esi + xorl %eax,%edi roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax + xorl %ebp,%esi + xorl %eax,%ebp addl %edx,%ecx - movl %ebp,%edi pxor %xmm4,%xmm0 -.byte 102,68,15,58,15,198,8 - xorl %eax,%ebp addl 0(%rsp),%ebx - andl %eax,%edi - pxor %xmm1,%xmm0 andl %ebp,%esi + xorl %eax,%ebp rorl $7,%edx - movdqa %xmm9,%xmm10 - paddd %xmm7,%xmm9 - addl %edi,%ebx + punpcklqdq %xmm7,%xmm8 movl %ecx,%edi - pxor %xmm8,%xmm0 + xorl %ebp,%esi + pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movdqa %xmm0,%xmm8 - movdqa %xmm9,48(%rsp) - movl %edx,%esi + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx addl 4(%rsp),%eax - andl %ebp,%esi - pslld $2,%xmm0 andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - psrld $30,%xmm8 - addl %esi,%eax + movdqa %xmm0,%xmm8 movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx - addl %ebx,%eax - por %xmm8,%xmm0 - movl %ecx,%edi + xorl %ecx,%esi + pslld $2,%xmm0 xorl %edx,%ecx - movdqa %xmm0,%xmm9 
+ addl %ebx,%eax + psrld $30,%xmm8 addl 8(%rsp),%ebp - andl %edx,%edi andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 rorl $7,%ebx - addl %edi,%ebp movl %eax,%edi + xorl %ecx,%esi roll $5,%eax + pshufd $238,%xmm7,%xmm9 addl %esi,%ebp - xorl %edx,%ecx - addl %eax,%ebp - movl %ebx,%esi + xorl %ebx,%edi xorl %ecx,%ebx + addl %eax,%ebp addl 12(%rsp),%edx - andl %ecx,%esi andl %ebx,%edi + xorl %ecx,%ebx rorl $7,%eax - addl %esi,%edx movl %ebp,%esi + xorl %ebx,%edi roll $5,%ebp addl %edi,%edx - xorl %ecx,%ebx + xorl %eax,%esi + xorl %ebx,%eax addl %ebp,%edx - movl %eax,%edi pxor %xmm5,%xmm1 -.byte 102,68,15,58,15,207,8 - xorl %ebx,%eax addl 16(%rsp),%ecx - andl %ebx,%edi - pxor %xmm2,%xmm1 andl %eax,%esi + xorl %ebx,%eax rorl $7,%ebp - movdqa %xmm10,%xmm8 - paddd %xmm0,%xmm10 - addl %edi,%ecx + punpcklqdq %xmm0,%xmm9 movl %edx,%edi - pxor %xmm9,%xmm1 + xorl %eax,%esi + pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - movdqa %xmm1,%xmm9 - movdqa %xmm10,0(%rsp) - movl %ebp,%esi + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx addl 20(%rsp),%ebx - andl %eax,%esi - pslld $2,%xmm1 andl %ebp,%edi + xorl %eax,%ebp rorl $7,%edx - psrld $30,%xmm9 - addl %esi,%ebx + movdqa %xmm1,%xmm9 movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) roll $5,%ecx addl %edi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - por %xmm9,%xmm1 - movl %edx,%edi + xorl %edx,%esi + pslld $2,%xmm1 xorl %ebp,%edx - movdqa %xmm1,%xmm10 + addl %ecx,%ebx + psrld $30,%xmm9 addl 24(%rsp),%eax - andl %ebp,%edi andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 rorl $7,%ecx - addl %edi,%eax movl %ebx,%edi + xorl %edx,%esi roll $5,%ebx + pshufd $238,%xmm0,%xmm10 addl %esi,%eax - xorl %ebp,%edx - addl %ebx,%eax - movl %ecx,%esi + xorl %ecx,%edi xorl %edx,%ecx + addl %ebx,%eax addl 28(%rsp),%ebp - andl %edx,%esi andl %ecx,%edi + xorl %edx,%ecx rorl $7,%ebx - addl %esi,%ebp movl %eax,%esi + xorl %ecx,%edi roll $5,%eax addl %edi,%ebp - xorl %edx,%ecx + xorl %ebx,%esi + xorl %ecx,%ebx addl %eax,%ebp - movl %ebx,%edi pxor %xmm6,%xmm2 -.byte 102,68,15,58,15,208,8 - xorl %ecx,%ebx addl 32(%rsp),%edx - andl %ecx,%edi - pxor %xmm3,%xmm2 andl %ebx,%esi + xorl %ecx,%ebx rorl $7,%eax - movdqa %xmm8,%xmm9 - paddd %xmm1,%xmm8 - addl %edi,%edx + punpcklqdq %xmm1,%xmm10 movl %ebp,%edi - pxor %xmm10,%xmm2 + xorl %ebx,%esi + pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx - xorl %ecx,%ebx - addl %ebp,%edx - movdqa %xmm2,%xmm10 - movdqa %xmm8,16(%rsp) - movl %eax,%esi + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx addl 36(%rsp),%ecx - andl %ebx,%esi - pslld $2,%xmm2 andl %eax,%edi + xorl %ebx,%eax rorl $7,%ebp - psrld $30,%xmm10 - addl %esi,%ecx + movdqa %xmm2,%xmm10 movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) roll $5,%edx addl %edi,%ecx - xorl %ebx,%eax - addl %edx,%ecx - por %xmm10,%xmm2 - movl %ebp,%edi + xorl %ebp,%esi + pslld $2,%xmm2 xorl %eax,%ebp - movdqa %xmm2,%xmm8 + addl %edx,%ecx + psrld $30,%xmm10 addl 40(%rsp),%ebx - andl %eax,%edi andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 rorl $7,%edx - addl %edi,%ebx movl %ecx,%edi + xorl %ebp,%esi roll $5,%ecx + pshufd $238,%xmm1,%xmm8 addl %esi,%ebx - xorl %eax,%ebp - addl %ecx,%ebx - movl %edx,%esi + xorl %edx,%edi xorl %ebp,%edx + addl %ecx,%ebx addl 44(%rsp),%eax - andl %ebp,%esi andl %edx,%edi + xorl %ebp,%edx rorl $7,%ecx - addl %esi,%eax movl %ebx,%esi + xorl %edx,%edi roll $5,%ebx addl %edi,%eax - xorl %ebp,%edx + xorl %edx,%esi 
addl %ebx,%eax - addl 48(%rsp),%ebp pxor %xmm7,%xmm3 -.byte 102,68,15,58,15,193,8 - xorl %edx,%esi + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 - xorl %ecx,%esi - addl %eax,%ebp + addl %esi,%ebp + xorl %ecx,%edi movdqa %xmm9,%xmm10 - paddd %xmm2,%xmm9 rorl $7,%ebx - addl %esi,%ebp + paddd %xmm2,%xmm9 + addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx - xorl %ecx,%edi + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi movdqa %xmm9,32(%rsp) - xorl %ebx,%edi - addl %ebp,%edx rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 + addl %ebp,%edx addl 56(%rsp),%ecx - xorl %ebx,%esi - psrld $30,%xmm8 + pslld $2,%xmm3 + xorl %eax,%esi movl %edx,%edi + psrld $30,%xmm8 roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp por %xmm8,%xmm3 + addl %edx,%ecx addl 60(%rsp),%ebx - xorl %eax,%edi + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 0(%rsp),%eax - paddd %xmm3,%xmm10 xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi movdqa %xmm10,48(%rsp) - addl %ebx,%eax rorl $7,%ecx - addl %esi,%eax + addl %ebx,%eax addl 4(%rsp),%ebp - xorl %edx,%edi + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 8(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 12(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi movl %edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx cmpq %r10,%r9 je L$done_ssse3 movdqa 64(%r11),%xmm6 - movdqa 0(%r11),%xmm9 + movdqa -64(%r11),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2278,113 +2205,112 @@ L$oop_ssse3: .byte 102,15,56,0,198 addq $64,%r9 addl 16(%rsp),%ebx - xorl %eax,%esi -.byte 102,15,56,0,206 + xorl %ebp,%esi movl %ecx,%edi +.byte 102,15,56,0,206 roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx paddd %xmm9,%xmm0 - xorl %ebp,%esi addl %ecx,%ebx - rorl $7,%edx - addl %esi,%ebx - movdqa %xmm0,0(%rsp) addl 20(%rsp),%eax - xorl %ebp,%edi - psubd %xmm9,%xmm0 + xorl %edx,%edi movl %ebx,%esi + movdqa %xmm0,0(%rsp) roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi -.byte 102,15,56,0,214 + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi +.byte 102,15,56,0,214 roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp paddd %xmm9,%xmm1 - xorl %eax,%esi addl %edx,%ecx - rorl $7,%ebp - addl %esi,%ecx - movdqa %xmm1,16(%rsp) addl 36(%rsp),%ebx - xorl %eax,%edi - psubd %xmm9,%xmm1 + xorl %ebp,%edi movl %ecx,%esi + movdqa %xmm1,16(%rsp) roll $5,%ecx - xorl 
%ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi -.byte 102,15,56,0,222 + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi +.byte 102,15,56,0,222 roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax paddd %xmm9,%xmm2 - xorl %ebx,%esi addl %ebp,%edx - rorl $7,%eax - addl %esi,%edx - movdqa %xmm2,32(%rsp) addl 52(%rsp),%ecx - xorl %ebx,%edi - psubd %xmm9,%xmm2 + xorl %eax,%edi movl %edx,%esi + movdqa %xmm2,32(%rsp) roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2394,108 +2320,110 @@ L$oop_ssse3: movl %esi,4(%r8) movl %esi,%ebx movl %ecx,8(%r8) + movl %ecx,%edi movl %edx,12(%r8) + xorl %edx,%edi movl %ebp,16(%r8) + andl %edi,%esi jmp L$oop_ssse3 .p2align 4 L$done_ssse3: addl 16(%rsp),%ebx - xorl %eax,%esi + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 20(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax - addl 24(%rsp),%ebp xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi movl %eax,%edi roll $5,%eax - xorl %ecx,%esi - addl %eax,%ebp - rorl $7,%ebx addl %esi,%ebp - addl 28(%rsp),%edx xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp - xorl %ebx,%edi - addl %ebp,%edx - rorl $7,%eax addl %edi,%edx - addl 32(%rsp),%ecx xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi movl %edx,%edi roll $5,%edx - xorl %eax,%esi - addl %edx,%ecx - rorl $7,%ebp addl %esi,%ecx - addl 36(%rsp),%ebx xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx - xorl %ebp,%edi - addl %ecx,%ebx - rorl $7,%edx addl %edi,%ebx - addl 40(%rsp),%eax xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi movl %ebx,%edi roll $5,%ebx - xorl %edx,%esi - addl %ebx,%eax - rorl $7,%ecx addl %esi,%eax - addl 44(%rsp),%ebp xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi movl %eax,%esi roll $5,%eax - xorl %ecx,%edi - addl %eax,%ebp - rorl $7,%ebx addl %edi,%ebp - addl 48(%rsp),%edx xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp - xorl %ebx,%esi - addl %ebp,%edx - rorl $7,%eax addl %esi,%edx - addl 52(%rsp),%ecx xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi movl 
%edx,%esi roll $5,%edx - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%ebp addl %edi,%ecx - addl 56(%rsp),%ebx xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx - xorl %ebp,%esi - addl %ecx,%ebx - rorl $7,%edx addl %esi,%ebx - addl 60(%rsp),%eax xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi movl %ebx,%esi roll $5,%ebx - xorl %edx,%edi - addl %ebx,%eax - rorl $7,%ecx addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx @@ -2506,21 +2434,28 @@ L$done_ssse3: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq 64(%rsp),%rsi - movq 0(%rsi),%r12 - movq 8(%rsi),%rbp - movq 16(%rsi),%rbx - leaq 24(%rsi),%rsp + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 .p2align 6 K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s index 36e0a06f18..e35d28fee0 100644 --- a/lib/accelerated/x86/macosx/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/macosx/sha256-ssse3-x86.s @@ -63,195 +63,391 @@ L000pic_point: movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) + jmp L002loop .align 4,0x90 L002loop: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx - movl 12(%edi),%edx bswap %eax + movl 12(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 16(%edi),%eax movl 20(%edi),%ebx movl 24(%edi),%ecx - movl 28(%edi),%edx bswap %eax + movl 28(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 32(%edi),%eax movl 36(%edi),%ebx movl 40(%edi),%ecx - movl 44(%edi),%edx bswap %eax + movl 44(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx movl 48(%edi),%eax movl 52(%edi),%ebx movl 56(%edi),%ecx - movl 60(%edi),%edx bswap %eax + movl 60(%edi),%edx bswap %ebx - bswap %ecx - bswap %edx pushl %eax + bswap %ecx pushl %ebx + bswap %edx pushl %ecx pushl %edx addl $64,%edi - subl $32,%esp - movl %edi,100(%esp) + leal -36(%esp),%esp + movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl %edi,12(%esp) + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) movl 
16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edi,28(%esp) + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) .align 4,0x90 L00300_15: - movl 92(%esp),%ebx movl %edx,%ecx + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi + movl 28(%esp),%edi xorl %edx,%ecx - rorl $5,%ecx - xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl 96(%esp),%ebx + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx - movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx - movl %eax,(%esp) - movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - andl %esi,%ecx - andl %edi,%eax - movl (%ebp),%esi - orl %ecx,%eax + xorl %edi,%eax addl $4,%ebp addl %ebx,%eax - addl %esi,%edx - addl %esi,%eax cmpl $3248222580,%esi jne L00300_15 - movl 152(%esp),%ebx + movl 156(%esp),%ecx + jmp L00416_63 .align 4,0x90 L00416_63: - movl %ebx,%esi - movl 100(%esp),%ecx - rorl $11,%esi - movl %ecx,%edi - xorl %ebx,%esi - rorl $7,%esi + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx shrl $3,%ebx - rorl $2,%edi - xorl %esi,%ebx - xorl %ecx,%edi - rorl $17,%edi - shrl $10,%ecx - addl 156(%esp),%ebx - xorl %ecx,%edi - addl 120(%esp),%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx movl %edx,%ecx - addl %edi,%ebx + xorl %esi,%edi + movl 24(%esp),%esi rorl $14,%ecx - movl 20(%esp),%esi - xorl %edx,%ecx - rorl $5,%ecx - movl %ebx,92(%esp) + addl %edi,%ebx + movl 28(%esp),%edi xorl %edx,%ecx - rorl $6,%ecx - movl 24(%esp),%edi - addl %ecx,%ebx xorl %edi,%esi - movl %edx,16(%esp) - movl %eax,%ecx + movl %ebx,96(%esp) + rorl $5,%ecx andl %edx,%esi - movl 12(%esp),%edx + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx xorl %edi,%esi - movl %eax,%edi + rorl $6,%edx + movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx - addl 28(%esp),%ebx + addl %edx,%ebx + movl 8(%esp),%edi xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp rorl $11,%ecx - movl 4(%esp),%esi + movl (%ebp),%esi xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) addl %ebx,%edx - movl 8(%esp),%edi + andl 4(%esp),%eax addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne L00416_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb L002loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 5,0x90 +L005loop_shrd: + movl (%edi),%eax + movl 
4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 4,0x90 +L00600_15_shrd: + movl %edx,%ecx + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne L00600_15_shrd + movl 156(%esp),%ecx + jmp L00716_63_shrd +.align 4,0x90 +L00716_63_shrd: + movl %ecx,%ebx + movl 104(%esp),%esi + shrdl $11,%ecx,%ecx + movl %esi,%edi + shrdl $2,%esi,%esi + xorl %ebx,%ecx + shrl $3,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + shrdl $17,%esi,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + shrdl $14,%ecx,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + shrdl $5,%ecx,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + shrdl $6,%edx,%edx movl %eax,%ecx - subl $4,%esp - orl %esi,%eax - andl %esi,%ecx - andl %edi,%eax + addl %esi,%ebx + shrdl $9,%ecx,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + shrdl $11,%ecx,%ecx movl (%ebp),%esi - orl %ecx,%eax + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx addl $4,%ebp addl %ebx,%eax - movl 152(%esp),%ebx - addl %esi,%edx - addl %esi,%eax cmpl $3329325298,%esi - jne L00416_63 - movl 352(%esp),%esi - movl 4(%esp),%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edi + jne L00716_63_shrd + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx - addl 8(%esi),%ecx - addl 12(%esi),%edi + addl 8(%esi),%edi + addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) - movl %ecx,8(%esi) - movl %edi,12(%esi) - movl 20(%esp),%eax - movl 24(%esp),%ebx - movl 28(%esp),%ecx - movl 356(%esp),%edi + movl 
%edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi addl 16(%esi),%edx addl 20(%esi),%eax addl 24(%esi),%ebx @@ -260,10 +456,10 @@ L00416_63: movl %eax,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) - addl $352,%esp + leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi - jb L002loop + jb L005loop_shrd movl 12(%esp),%esp popl %edi popl %esi @@ -272,25 +468,2918 @@ L00416_63: ret .align 6,0x90 L001K256: -.long 1116352408,1899447441,3049323471,3921009573 -.long 961987163,1508970993,2453635748,2870763221 -.long 3624381080,310598401,607225278,1426881987 -.long 1925078388,2162078206,2614888103,3248222580 -.long 3835390401,4022224774,264347078,604807628 -.long 770255983,1249150122,1555081692,1996064986 -.long 2554220882,2821834349,2952996808,3210313671 -.long 3336571891,3584528711,113926993,338241895 -.long 666307205,773529912,1294757372,1396182291 -.long 1695183700,1986661051,2177026350,2456956037 -.long 2730485921,2820302411,3259730800,3345764771 -.long 3516065817,3600352804,4094571909,275423344 -.long 430227734,506948616,659060556,883997877 -.long 958139571,1322822218,1537002063,1747873779 -.long 1955562222,2024104815,2227730452,2361852424 -.long 2428436474,2756734187,3204031479,3329325298 +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 +.align 4,0x90 +L008unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp L009grand_loop +.align 4,0x90 +L009grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl 
%edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + 
rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx 
+ movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 
4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax 
+ addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + 
rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + 
addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + 
rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl 
$5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl 
$6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + 
movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl 
%esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp 
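
Interleaved with the rounds, the rorl/shrl clusters above expand the message schedule in place: the rotating 16-word window lives at 32(%esp) through 92(%esp), and each new word W[t] is derived from W[t-2], W[t-7], W[t-15] and W[t-16]. A minimal C sketch of that expansion step, reusing ror32 from the previous sketch (illustrative, not the generated code):

/* SHA-256 message-schedule expansion for round t >= 16 over a 16-word ring buffer. */
static uint32_t sha256_expand(uint32_t w[16], unsigned t)
{
	uint32_t w15 = w[(t - 15) & 15], w2 = w[(t - 2) & 15];
	uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);   /* sigma0 */
	uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);    /* sigma1 */
	w[t & 15] += s0 + s1 + w[(t - 7) & 15];
	return w[t & 15];
}

The shift counts visible in the assembly (rorl $11 and $7 with shrl $3, rorl $2 and $17 with shrl $10) are these same sigma0/sigma1 functions computed as chained rotates.
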
+ movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl 
$3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + 
movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb L009grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
index 0014a8116b..53a7945f1c 100644
--- a/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
+++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86.s
@@ -593,6 +593,8 @@ L001K512:
 .long 4234509866,1501505948
 .long 987167468,1607167915
 .long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
diff --git a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
index 23da5e6e39..c48240f457 100644
--- a/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
+++ b/lib/accelerated/x86/macosx/sha512-ssse3-x86_64.s
@@ -39,10 +39,17 @@ #
 .text
+.globl _sha256_block_data_order
 .p2align 4
 _sha256_block_data_order:
+ leaq __gnutls_x86_cpuid_s(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $512,%r10d
+ jnz L$ssse3_shortcut
 pushq %rbx
 pushq %rbp
 pushq %r12
@@ -60,8 +67,6 @@ _sha256_block_data_order:
 movq %r11,64+24(%rsp)
 L$prologue:
- leaq K256(%rip),%rbp
-
 movl 0(%rdi),%eax
 movl 4(%rdi),%ebx
 movl 8(%rdi),%ecx
@@ -74,1694 +79,1632 @@ L$prologue:
 .p2align 4
 L$loop:
- xorq %rdi,%rdi
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
 movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl
4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - 
rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp + addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl 
%r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + 
addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp jmp L$rounds_16_xx .p2align 4 L$rounds_16_xx: movl 4(%rsp),%r13d - movl 56(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 56(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 36(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,0(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,0(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d - addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 
8(%rsp),%r13d - movl 60(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 60(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 40(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,4(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,4(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 12(%rsp),%r13d - movl 0(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 0(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 44(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,8(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,8(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 16(%rsp),%r13d - movl 4(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 4(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 48(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,12(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl 
%r12d,12(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 20(%rsp),%r13d - movl 8(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 8(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 52(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,16(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,16(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 24(%rsp),%r13d - movl 12(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 12(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 56(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,20(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,20(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 28(%rsp),%r13d - movl 16(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 16(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl 
%r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 60(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,24(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,24(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 32(%rsp),%r13d - movl 20(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 20(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 0(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,28(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,28(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax + leaq 20(%rbp),%rbp movl 36(%rsp),%r13d - movl 24(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 24(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 4(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d - movl %r12d,32(%rsp) - rorl $9,%r14d xorl %r8d,%r13d + rorl $9,%r14d xorl %r10d,%r15d - rorl $5,%r13d - addl %r11d,%r12d + movl %r12d,32(%rsp) xorl %eax,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r8d,%r15d - movl %ebx,%r11d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d - xorl %r10d,%r15d + addl %r15d,%r12d - xorl %ecx,%r11d + movl %eax,%r15d + addl (%rbp),%r12d xorl %eax,%r14d 
- addl %r15d,%r12d - movl %ebx,%r15d + xorl %ebx,%r15d rorl $6,%r13d - andl %eax,%r11d - andl %ecx,%r15d + movl %ebx,%r11d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r11d + xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d - leaq 1(%rdi),%rdi - addl %r14d,%r11d + leaq 4(%rbp),%rbp movl 40(%rsp),%r13d - movl 28(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 28(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 8(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d - movl %r8d,%r15d - movl %r12d,36(%rsp) + movl %r8d,%edi - rorl $9,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + rorl $9,%r14d + xorl %r9d,%edi - rorl $5,%r13d - addl %r10d,%r12d + movl %r12d,36(%rsp) xorl %r11d,%r14d + andl %edx,%edi - addl (%rbp,%rdi,4),%r12d - andl %edx,%r15d - movl %eax,%r10d + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d - xorl %r9d,%r15d + addl %edi,%r12d - xorl %ebx,%r10d + movl %r11d,%edi + addl (%rbp),%r12d xorl %r11d,%r14d - addl %r15d,%r12d - movl %eax,%r15d + xorl %eax,%edi rorl $6,%r13d - andl %r11d,%r10d - andl %ebx,%r15d + movl %eax,%r10d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r10d + xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d - leaq 1(%rdi),%rdi - addl %r14d,%r10d + leaq 4(%rbp),%rbp movl 44(%rsp),%r13d - movl 32(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 32(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 12(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d - movl %r12d,40(%rsp) - rorl $9,%r14d xorl %ecx,%r13d + rorl $9,%r14d xorl %r8d,%r15d - rorl $5,%r13d - addl %r9d,%r12d + movl %r12d,40(%rsp) xorl %r10d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %ecx,%r15d - movl %r11d,%r9d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d - xorl %r8d,%r15d + addl %r15d,%r12d - xorl %eax,%r9d + movl %r10d,%r15d + addl (%rbp),%r12d xorl %r10d,%r14d - addl %r15d,%r12d - movl %r11d,%r15d + xorl %r11d,%r15d rorl $6,%r13d - andl %r10d,%r9d - andl %eax,%r15d + movl %r11d,%r9d + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r9d + xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d - leaq 1(%rdi),%rdi - addl %r14d,%r9d + leaq 4(%rbp),%rbp movl 48(%rsp),%r13d - movl 36(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 36(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 16(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl 
%r14d,%edi + addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d - movl %ecx,%r15d - movl %r12d,44(%rsp) + movl %ecx,%edi - rorl $9,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + rorl $9,%r14d + xorl %edx,%edi - rorl $5,%r13d - addl %r8d,%r12d + movl %r12d,44(%rsp) xorl %r9d,%r14d + andl %ebx,%edi - addl (%rbp,%rdi,4),%r12d - andl %ebx,%r15d - movl %r10d,%r8d + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d - xorl %edx,%r15d + addl %edi,%r12d - xorl %r11d,%r8d + movl %r9d,%edi + addl (%rbp),%r12d xorl %r9d,%r14d - addl %r15d,%r12d - movl %r10d,%r15d + xorl %r10d,%edi rorl $6,%r13d - andl %r9d,%r8d - andl %r11d,%r15d + movl %r10d,%r8d + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%r8d + xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d - leaq 1(%rdi),%rdi - addl %r14d,%r8d + leaq 20(%rbp),%rbp movl 52(%rsp),%r13d - movl 40(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 40(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 20(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d - movl %r12d,48(%rsp) - rorl $9,%r14d xorl %eax,%r13d + rorl $9,%r14d xorl %ecx,%r15d - rorl $5,%r13d - addl %edx,%r12d + movl %r12d,48(%rsp) xorl %r8d,%r14d - - addl (%rbp,%rdi,4),%r12d andl %eax,%r15d - movl %r9d,%edx + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d - xorl %ecx,%r15d + addl %r15d,%r12d - xorl %r10d,%edx + movl %r8d,%r15d + addl (%rbp),%r12d xorl %r8d,%r14d - addl %r15d,%r12d - movl %r9d,%r15d + xorl %r9d,%r15d rorl $6,%r13d - andl %r8d,%edx - andl %r10d,%r15d + movl %r9d,%edx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%edx + xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx - leaq 1(%rdi),%rdi - addl %r14d,%edx + leaq 4(%rbp),%rbp movl 56(%rsp),%r13d - movl 44(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 44(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 24(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d - movl %eax,%r15d - movl %r12d,52(%rsp) + movl %eax,%edi - rorl $9,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + rorl $9,%r14d + xorl %ebx,%edi - rorl $5,%r13d - addl %ecx,%r12d + movl %r12d,52(%rsp) xorl %edx,%r14d + andl %r11d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r11d,%r15d - movl %r8d,%ecx + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d - xorl %ebx,%r15d + addl %edi,%r12d - xorl %r9d,%ecx + movl %edx,%edi + addl (%rbp),%r12d xorl %edx,%r14d - addl %r15d,%r12d - movl %r8d,%r15d + xorl %r8d,%edi rorl $6,%r13d - andl %edx,%ecx - andl %r9d,%r15d + movl %r8d,%ecx + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ecx + xorl 
%r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx - leaq 1(%rdi),%rdi - addl %r14d,%ecx + leaq 4(%rbp),%rbp movl 60(%rsp),%r13d - movl 48(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 48(%rsp),%r15d - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d - rorl $7,%r12d xorl %r12d,%r13d - movl 28(%rsp),%r12d - - rorl $2,%r15d + shrl $3,%r12d + rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d - addl %r14d,%r12d + addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d - movl %r12d,56(%rsp) - rorl $9,%r14d xorl %r10d,%r13d + rorl $9,%r14d xorl %eax,%r15d - rorl $5,%r13d - addl %ebx,%r12d + movl %r12d,56(%rsp) xorl %ecx,%r14d - - addl (%rbp,%rdi,4),%r12d andl %r10d,%r15d - movl %edx,%ebx + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d - xorl %eax,%r15d + addl %r15d,%r12d - xorl %r8d,%ebx + movl %ecx,%r15d + addl (%rbp),%r12d xorl %ecx,%r14d - addl %r15d,%r12d - movl %edx,%r15d + xorl %edx,%r15d rorl $6,%r13d - andl %ecx,%ebx - andl %r8d,%r15d + movl %edx,%ebx + andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%ebx + xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx - leaq 1(%rdi),%rdi - addl %r14d,%ebx + leaq 4(%rbp),%rbp movl 0(%rsp),%r13d - movl 52(%rsp),%r14d - movl %r13d,%r12d - movl %r14d,%r15d + movl 52(%rsp),%edi - rorl $11,%r12d - xorl %r13d,%r12d - shrl $3,%r13d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi - rorl $7,%r12d xorl %r12d,%r13d - movl 32(%rsp),%r12d - - rorl $2,%r15d - xorl %r14d,%r15d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi shrl $10,%r14d - rorl $17,%r15d - addl %r13d,%r12d - xorl %r15d,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d - addl %r14d,%r12d + addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d - movl %r10d,%r15d - movl %r12d,60(%rsp) + movl %r10d,%edi - rorl $9,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + rorl $9,%r14d + xorl %r11d,%edi - rorl $5,%r13d - addl %eax,%r12d + movl %r12d,60(%rsp) xorl %ebx,%r14d + andl %r9d,%edi - addl (%rbp,%rdi,4),%r12d - andl %r9d,%r15d - movl %ecx,%eax + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d - xorl %r11d,%r15d + addl %edi,%r12d - xorl %edx,%eax + movl %ebx,%edi + addl (%rbp),%r12d xorl %ebx,%r14d - addl %r15d,%r12d - movl %ecx,%r15d + xorl %ecx,%edi rorl $6,%r13d - andl %ebx,%eax - andl %edx,%r15d + movl %ecx,%eax + andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d - addl %r15d,%eax + xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax - leaq 1(%rdi),%rdi - addl %r14d,%eax - cmpq $64,%rdi - jb L$rounds_16_xx + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz L$rounds_16_xx movq 64+0(%rsp),%rdi + addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax @@ -1800,19 +1743,1138 @@ L$epilogue: K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+
+.p2align 6
+sha256_block_data_order_ssse3:
+L$ssse3_shortcut:
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%r11
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%r11,64+24(%rsp)
+L$prologue_ssse3:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	L$loop_ssse3
+.p2align 4
+L$loop_ssse3:
+	movdqa	K256+512(%rip),%xmm7
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+.byte	102,15,56,0,199
+	leaq	K256(%rip),%rbp
+.byte	102,15,56,0,207
+	movdqa	0(%rbp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	32(%rbp),%xmm5
+	paddd	%xmm0,%xmm4
+	movdqa	64(%rbp),%xmm6
+.byte	102,15,56,0,223
+	movdqa	96(%rbp),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	L$ssse3_00_47
+
+.p2align 4
+L$ssse3_00_47:
+	subq	$-128,%rbp
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+
rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd 
$128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl 
%r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 
4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d 
+ movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl 
%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	L$loop_ssse3
+
+	movq	64+24(%rsp),%rsi
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+L$epilogue_ssse3:
+	.byte	0xf3,0xc3
+
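
Note on the scalar hunk above: the old code indexed the K256 table through a round counter (addl (%rbp,%rdi,4),%r12d with leaq 1(%rdi),%rdi and a cmpq $64,%rdi exit test), while the new code walks a pointer instead, reading each constant with addl (%rbp),%r12d and bumping with leaq 4(%rbp),%rbp. The 16-round block now ends when cmpb $0,3(%rbp) sees a zero top byte, which none of the 64 round constants has but which the first word of the byte-swap mask block appended after K256 (0x00010203) does, so the mask apparently doubles as the table terminator. The rewrite also appears to carry the Maj() intermediate across rounds in the %edi/%r15d pair: since the next round's b is this round's a, the a^b term can be reused rather than recomputed. A minimal C sketch of one round using both idioms follows; it is illustrative only, not the generated code, and uses the canonical rotate amounts where the assembly reaches the same values through staged rotates.

    #include <stdint.h>

    #define ROR(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* One SHA-256 round, sketching the two idioms visible in the diff:
     *  - Maj(a,b,c) computed as b ^ ((a^b) & (b^c)), so the a^b term can
     *    be kept live across rounds (the %edi/%r15d ping-pong above);
     *  - the round constant read through a pointer that advances by 4,
     *    mirroring leaq 4(%rbp),%rbp, instead of counter indexing. */
    static inline void
    sha256_round(uint32_t s[8], const uint32_t **k, uint32_t w)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ROR(e, 6) ^ ROR(e, 11) ^ ROR(e, 25);
        uint32_t ch  = g ^ (e & (f ^ g));          /* Ch(e,f,g) */
        uint32_t t1  = h + S1 + ch + **k + w;
        uint32_t S0  = ROR(a, 2) ^ ROR(a, 13) ^ ROR(a, 22);
        uint32_t maj = b ^ ((a ^ b) & (b ^ c));    /* Maj(a,b,c) */
        uint32_t t2  = S0 + maj;

        (*k)++;                                    /* walk the K256 table */

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

A driver would call this 64 times with w = W[t] and k walking the constant table; the generated code instead unrolls 16 rounds at a time and rotates the register assignment rather than shuffling state in memory.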
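
The bulk of the added code is sha256_block_data_order_ssse3, which keeps the same scalar round computation but interleaves it with a four-lane message expansion: the .byte 102,15,56,0,... sequences encode pshufb (the 0x00010203,... row at K256+512 is its big-endian byte-swap mask), the .byte 102,15,58,15,... sequences encode palignr, and the psrld/pslld/pxor chains evaluate the small sigma functions. Each 16-byte group of K256 now appears twice, which the 32-byte strides in the loop (movdqa 0/32/64/96(%rbp) with subq $-128,%rbp) evidently rely on. For reference, a scalar C sketch of the expansion those vector chains compute four words at a time, per FIPS 180-4 (helper names are illustrative):

    #include <stdint.h>

    #define ROR(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* FIPS 180-4 small sigmas; the psrld/pslld/pxor chains above compute
     * these for four schedule words per iteration. */
    static inline uint32_t sigma0(uint32_t x) { return ROR(x, 7) ^ ROR(x, 18) ^ (x >> 3); }
    static inline uint32_t sigma1(uint32_t x) { return ROR(x, 17) ^ ROR(x, 19) ^ (x >> 10); }

    /* Expand W[16..63] from the 16 message words of one block. */
    static void sha256_expand(uint32_t W[64])
    {
        for (int t = 16; t < 64; t++)
            W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
    }

The vector path also folds the round-constant additions into the same pass (paddd into %xmm4..%xmm7, staged at 0(%rsp)..48(%rsp)), which is why the round code reads a ready-made K+W value with addl 0(%rsp),%r11d instead of adding the constant and the schedule word separately.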