Diffstat (limited to 'deps/openssl/config/archs/BSD-x86/asm/crypto/chacha/chacha-x86.s')
-rw-r--r-- | deps/openssl/config/archs/BSD-x86/asm/crypto/chacha/chacha-x86.s | 1443
1 file changed, 1443 insertions(+), 0 deletions(-)
diff --git a/deps/openssl/config/archs/BSD-x86/asm/crypto/chacha/chacha-x86.s b/deps/openssl/config/archs/BSD-x86/asm/crypto/chacha/chacha-x86.s
new file mode 100644
index 0000000000..1fea91cddf
--- /dev/null
+++ b/deps/openssl/config/archs/BSD-x86/asm/crypto/chacha/chacha-x86.s
@@ -0,0 +1,1443 @@
+.text
+.globl _ChaCha20_ctr32
+.type _ChaCha20_ctr32,@function
+.align 4
+_ChaCha20_ctr32:
+L_ChaCha20_ctr32_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ cmpl 28(%esp),%eax
+ je L000no_data
+ call Lpic_point
+Lpic_point:
+ popl %eax
+ leal __GLOBAL_OFFSET_TABLE_+[.-Lpic_point](%eax),%ebp
+ movl _OPENSSL_ia32cap_P@GOT(%ebp),%ebp
+ testl $16777216,(%ebp)
+ jz L001x86
+ testl $512,4(%ebp)
+ jz L001x86
+ jmp Lssse3_shortcut
+L001x86:
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ subl $132,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,80(%esp)
+ movl %ebx,84(%esp)
+ movl %ecx,88(%esp)
+ movl %edx,92(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,96(%esp)
+ movl %ebx,100(%esp)
+ movl %ecx,104(%esp)
+ movl %edx,108(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ subl $1,%eax
+ movl %eax,112(%esp)
+ movl %ebx,116(%esp)
+ movl %ecx,120(%esp)
+ movl %edx,124(%esp)
+ jmp L002entry
+.align 4,0x90
+L003outer_loop:
+ movl %ebx,156(%esp)
+ movl %eax,152(%esp)
+ movl %ecx,160(%esp)
+L002entry:
+ movl $1634760805,%eax
+ movl $857760878,4(%esp)
+ movl $2036477234,8(%esp)
+ movl $1797285236,12(%esp)
+ movl 84(%esp),%ebx
+ movl 88(%esp),%ebp
+ movl 104(%esp),%ecx
+ movl 108(%esp),%esi
+ movl 116(%esp),%edx
+ movl 120(%esp),%edi
+ movl %ebx,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ecx,40(%esp)
+ movl %esi,44(%esp)
+ movl %edx,52(%esp)
+ movl %edi,56(%esp)
+ movl 92(%esp),%ebx
+ movl 124(%esp),%edi
+ movl 112(%esp),%edx
+ movl 80(%esp),%ebp
+ movl 96(%esp),%ecx
+ movl 100(%esp),%esi
+ addl $1,%edx
+ movl %ebx,28(%esp)
+ movl %edi,60(%esp)
+ movl %edx,112(%esp)
+ movl $10,%ebx
+ jmp L004loop
+.align 4,0x90
+L004loop:
+ addl %ebp,%eax
+ movl %ebx,128(%esp)
+ movl %ebp,%ebx
+ xorl %eax,%edx
+ roll $16,%edx
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 52(%esp),%edi
+ roll $12,%ebx
+ movl 20(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,48(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,32(%esp)
+ roll $16,%edi
+ movl %ebx,16(%esp)
+ addl %edi,%esi
+ movl 40(%esp),%ecx
+ xorl %esi,%ebp
+ movl 56(%esp),%edx
+ roll $12,%ebp
+ movl 24(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,52(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,36(%esp)
+ roll $16,%edx
+ movl %ebp,20(%esp)
+ addl %edx,%ecx
+ movl 44(%esp),%esi
+ xorl %ecx,%ebx
+ movl 60(%esp),%edi
+ roll $12,%ebx
+ movl 28(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,56(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,24(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ roll $12,%ebp
+ movl 20(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,%edx
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ roll $16,%edx
+ movl %ebp,28(%esp)
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 48(%esp),%edi
+ roll $12,%ebx
+ movl 24(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,60(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,40(%esp)
+ roll $16,%edi
+ movl %ebx,20(%esp)
+ addl %edi,%esi
+ movl 32(%esp),%ecx
+ xorl %esi,%ebp
+ movl 52(%esp),%edx
+ roll $12,%ebp
+ movl 28(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,48(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,44(%esp)
+ roll $16,%edx
+ movl %ebp,24(%esp)
+ addl %edx,%ecx
+ movl 36(%esp),%esi
+ xorl %ecx,%ebx
+ movl 56(%esp),%edi
+ roll $12,%ebx
+ movl 16(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,52(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,28(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ movl 48(%esp),%edx
+ roll $12,%ebp
+ movl 128(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,56(%esp)
+ xorl %esi,%ebp
+ roll $7,%ebp
+ decl %ebx
+ jnz L004loop
+ movl 160(%esp),%ebx
+ addl $1634760805,%eax
+ addl 80(%esp),%ebp
+ addl 96(%esp),%ecx
+ addl 100(%esp),%esi
+ cmpl $64,%ebx
+ jb L005tail
+ movl 156(%esp),%ebx
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ xorl (%ebx),%eax
+ xorl 16(%ebx),%ebp
+ movl %eax,(%esp)
+ movl 152(%esp),%eax
+ xorl 32(%ebx),%ecx
+ xorl 36(%ebx),%esi
+ xorl 48(%ebx),%edx
+ xorl 56(%ebx),%edi
+ movl %ebp,16(%eax)
+ movl %ecx,32(%eax)
+ movl %esi,36(%eax)
+ movl %edx,48(%eax)
+ movl %edi,56(%eax)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ xorl 4(%ebx),%ebp
+ xorl 8(%ebx),%ecx
+ xorl 12(%ebx),%esi
+ xorl 20(%ebx),%edx
+ xorl 24(%ebx),%edi
+ movl %ebp,4(%eax)
+ movl %ecx,8(%eax)
+ movl %esi,12(%eax)
+ movl %edx,20(%eax)
+ movl %edi,24(%eax)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ xorl 28(%ebx),%ebp
+ xorl 40(%ebx),%ecx
+ xorl 44(%ebx),%esi
+ xorl 52(%ebx),%edx
+ xorl 60(%ebx),%edi
+ leal 64(%ebx),%ebx
+ movl %ebp,28(%eax)
+ movl (%esp),%ebp
+ movl %ecx,40(%eax)
+ movl 160(%esp),%ecx
+ movl %esi,44(%eax)
+ movl %edx,52(%eax)
+ movl %edi,60(%eax)
+ movl %ebp,(%eax)
+ leal 64(%eax),%eax
+ subl $64,%ecx
+ jnz L003outer_loop
+ jmp L006done
+L005tail:
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ movl %eax,(%esp)
+ movl %ebp,16(%esp)
+ movl %ecx,32(%esp)
+ movl %esi,36(%esp)
+ movl %edx,48(%esp)
+ movl %edi,56(%esp)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ movl %ebp,4(%esp)
+ movl %ecx,8(%esp)
+ movl %esi,12(%esp)
+ movl %edx,20(%esp)
+ movl %edi,24(%esp)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ movl %ebp,28(%esp)
+ movl 156(%esp),%ebp
+ movl %ecx,40(%esp)
+ movl 152(%esp),%ecx
+ movl %esi,44(%esp)
+ xorl %esi,%esi
+ movl %edx,52(%esp)
+ movl %edi,60(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+L007tail_loop:
+ movb (%esi,%ebp,1),%al
+ movb (%esp,%esi,1),%dl
+ leal 1(%esi),%esi
+ xorb %dl,%al
+ movb %al,-1(%ecx,%esi,1)
+ decl %ebx
+ jnz L007tail_loop
+L006done:
+ addl $132,%esp
+L000no_data:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _ChaCha20_ssse3
+.type _ChaCha20_ssse3,@function
+.align 4
+_ChaCha20_ssse3:
+L_ChaCha20_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz Lxop_shortcut
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal Lssse3_data-Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb L0081x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ movdqu (%edx),%xmm7
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ paddd 48(%eax),%xmm0
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ psubd 64(%eax),%xmm0
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,64(%ebp)
+ movdqa %xmm1,80(%ebp)
+ movdqa %xmm2,96(%ebp)
+ movdqa %xmm3,112(%ebp)
+ movdqu 16(%edx),%xmm3
+ movdqa %xmm4,-64(%ebp)
+ movdqa %xmm5,-48(%ebp)
+ movdqa %xmm6,-32(%ebp)
+ movdqa %xmm7,-16(%ebp)
+ movdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,(%ebp)
+ movdqa %xmm1,16(%ebp)
+ movdqa %xmm2,32(%ebp)
+ movdqa %xmm3,48(%ebp)
+ movdqa %xmm4,-128(%ebp)
+ movdqa %xmm5,-112(%ebp)
+ movdqa %xmm6,-96(%ebp)
+ movdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp L009outer_loop
+.align 4,0x90
+L009outer_loop:
+ movdqa -112(%ebp),%xmm1
+ movdqa -96(%ebp),%xmm2
+ movdqa -80(%ebp),%xmm3
+ movdqa -48(%ebp),%xmm5
+ movdqa -32(%ebp),%xmm6
+ movdqa -16(%ebp),%xmm7
+ movdqa %xmm1,-112(%ebx)
+ movdqa %xmm2,-96(%ebx)
+ movdqa %xmm3,-80(%ebx)
+ movdqa %xmm5,-48(%ebx)
+ movdqa %xmm6,-32(%ebx)
+ movdqa %xmm7,-16(%ebx)
+ movdqa 32(%ebp),%xmm2
+ movdqa 48(%ebp),%xmm3
+ movdqa 64(%ebp),%xmm4
+ movdqa 80(%ebp),%xmm5
+ movdqa 96(%ebp),%xmm6
+ movdqa 112(%ebp),%xmm7
+ paddd 64(%eax),%xmm4
+ movdqa %xmm2,32(%ebx)
+ movdqa %xmm3,48(%ebx)
+ movdqa %xmm4,64(%ebx)
+ movdqa %xmm5,80(%ebx)
+ movdqa %xmm6,96(%ebx)
+ movdqa %xmm7,112(%ebx)
+ movdqa %xmm4,64(%ebp)
+ movdqa -128(%ebp),%xmm0
+ movdqa %xmm4,%xmm6
+ movdqa -64(%ebp),%xmm3
+ movdqa (%ebp),%xmm4
+ movdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 4,0x90
+L010loop:
+ paddd %xmm3,%xmm0
+ movdqa %xmm3,%xmm2
+ pxor %xmm0,%xmm6
+ pshufb (%eax),%xmm6
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -48(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 80(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,64(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-64(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -32(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 96(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,80(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,16(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-48(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 48(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -16(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 112(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,96(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-32(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa -48(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,%xmm6
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-16(%ebx)
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -32(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,112(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,32(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-48(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa (%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -16(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 80(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,64(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,48(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-32(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 16(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -64(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 96(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,80(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-16(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 64(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,96(%ebx)
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ por %xmm1,%xmm3
+ decl %edx
+ jnz L010loop
+ movdqa %xmm3,-64(%ebx)
+ movdqa %xmm4,(%ebx)
+ movdqa %xmm5,16(%ebx)
+ movdqa %xmm6,64(%ebx)
+ movdqa %xmm7,96(%ebx)
+ movdqa -112(%ebx),%xmm1
+ movdqa -96(%ebx),%xmm2
+ movdqa -80(%ebx),%xmm3
+ paddd -128(%ebp),%xmm0
+ paddd -112(%ebp),%xmm1
+ paddd -96(%ebp),%xmm2
+ paddd -80(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa -64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa -48(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa -32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa -16(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd -64(%ebp),%xmm0
+ paddd -48(%ebp),%xmm1
+ paddd -32(%ebp),%xmm2
+ paddd -16(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa (%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 16(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 48(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd (%ebp),%xmm0
+ paddd 16(%ebp),%xmm1
+ paddd 32(%ebp),%xmm2
+ paddd 48(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 80(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 96(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 112(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd 64(%ebp),%xmm0
+ paddd 80(%ebp),%xmm1
+ paddd 96(%ebp),%xmm2
+ paddd 112(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 208(%esi),%esi
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm5
+ pxor %xmm2,%xmm6
+ pxor %xmm3,%xmm7
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc L009outer_loop
+ addl $256,%ecx
+ jz L011done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ movd 64(%ebp),%xmm2
+ movdqu (%ebx),%xmm3
+ paddd 96(%eax),%xmm2
+ pand 112(%eax),%xmm3
+ por %xmm2,%xmm3
+L0081x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp L012loop1x
+.align 4,0x90
+L013outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp L012loop1x
+.align 4,0x90
+L012loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz L012loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb L014tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz L013outer1x
+ jmp L011done
+L014tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+L015tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz L015tail_loop
+L011done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 6,0x90
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte 114,103,62,0
+.globl _ChaCha20_xop
+.type _ChaCha20_xop,@function
+.align 4
+_ChaCha20_xop:
+L_ChaCha20_xop_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+Lxop_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ vzeroupper
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal Lssse3_data-Lpic_point(%eax),%eax
+ vmovdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb L0161x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ vmovdqu (%edx),%xmm7
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpaddd 48(%eax),%xmm0,%xmm0
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp)
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3
+ vmovdqa %xmm4,-64(%ebp)
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp)
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp)
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp L017outer_loop
+.align 5,0x90
+L017outer_loop:
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp)
+ vmovdqa -128(%ebp),%xmm0
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 5,0x90
+L018loop:
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -48(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7
+ decl %edx
+ jnz L018loop
+ vmovdqa %xmm3,-64(%ebx)
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc L017outer_loop
+ addl $256,%ecx
+ jz L019done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2
+ vmovdqu (%ebx),%xmm3
+ vpaddd 96(%eax),%xmm2,%xmm2
+ vpand 112(%eax),%xmm3,%xmm3
+ vpor %xmm2,%xmm3,%xmm3
+L0161x:
+ vmovdqa 32(%eax),%xmm0
+ vmovdqu (%edx),%xmm1
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp L020loop1x
+.align 4,0x90
+L021outer1x:
+ vmovdqa 80(%eax),%xmm3
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp)
+ jmp L020loop1x
+.align 4,0x90
+L020loop1x:
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $57,%xmm1,%xmm1
+ vpshufd $147,%xmm3,%xmm3
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz L020loop1x
+ vpaddd (%esp),%xmm0,%xmm0
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb L022tail
+ vpxor (%esi),%xmm0,%xmm0
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz L021outer1x
+ jmp L019done
+L022tail:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+L023tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz L023tail_loop
+L019done:
+ vzeroupper
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.comm _OPENSSL_ia32cap_P,16
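Note: the scalar L004loop above is a fully unrolled ChaCha20 double round; the roll counts 16/12/8/7 and the constants 1634760805, 857760878, 2036477234, 1797285236 (the "expa nd 3 2-by te k" sigma words) identify it. As a reading aid, here is a minimal C sketch of the quarter round the assembly schedules across registers, together with the presumed prototype of the entry point. The prototype follows OpenSSL's internal ChaCha20_ctr32 convention as I understand it; treat it as an assumption, since the diff itself does not state a signature.

    /* Reference sketch (not part of the diff): one ChaCha quarter round. */
    #include <stddef.h>
    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    /* a += b; d ^= a; d <<<= 16; then the 12, 8, 7 steps, as in L004loop. */
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }

    /* Presumed prototype of the assembly entry point (assumption: key as
     * eight 32-bit words, counter||nonce as four 32-bit words). */
    void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                        size_t len, const unsigned int key[8],
                        const unsigned int counter[4]);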
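The entry point also dispatches on the OPENSSL_ia32cap_P capability vector: testl $16777216,(%ebp) checks bit 24 of word 0 and testl $512,4(%ebp) checks bit 9 of word 1 before taking Lssse3_shortcut, and inside that path testl $2048,4(%ebp) (bit 11 of word 1) diverts to Lxop_shortcut. A hypothetical C rendering of that branch structure follows; the bit meanings (FXSR, SSSE3, and OpenSSL-mapped XOP) are my reading of the capability layout, not something the diff states.

    #include <stdint.h>

    extern unsigned int OPENSSL_ia32cap_P[4];   /* declared by .comm above */

    typedef void (*chacha_impl)(void);          /* placeholder signature */

    /* Hypothetical sketch of the dispatch at L_ChaCha20_ctr32_begin. */
    static chacha_impl pick_impl(chacha_impl x86, chacha_impl ssse3,
                                 chacha_impl xop)
    {
        if ((OPENSSL_ia32cap_P[0] & (1u << 24)) &&  /* testl $16777216,(%ebp) */
            (OPENSSL_ia32cap_P[1] & (1u << 9))) {   /* testl $512,4(%ebp)     */
            if (OPENSSL_ia32cap_P[1] & (1u << 11))  /* testl $2048,4(%ebp)    */
                return xop;
            return ssse3;
        }
        return x86;                                 /* fall through to L001x86 */
    }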
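Finally, the two 16-byte tables at Lssse3_data are pshufb masks: 2,3,0,1,... rotates each 32-bit lane left by 16 and 3,0,1,2,... rotates it left by 8, which is why the vectorized rounds only need pslld/psrld pairs for the 12- and 7-bit rotations. A small intrinsics sketch of the same trick (my illustration, not code from the file):

    #include <tmmintrin.h>   /* SSSE3 _mm_shuffle_epi8 */

    /* Rotate each 32-bit lane left by 16 via byte shuffle (first mask row). */
    static __m128i rotl16_lanes(__m128i v)
    {
        const __m128i m = _mm_setr_epi8(2,3,0,1, 6,7,4,5,
                                        10,11,8,9, 14,15,12,13);
        return _mm_shuffle_epi8(v, m);
    }

    /* Rotate each 32-bit lane left by 8 (second mask row). */
    static __m128i rotl8_lanes(__m128i v)
    {
        const __m128i m = _mm_setr_epi8(3,0,1,2, 7,4,5,6,
                                        11,8,9,10, 15,12,13,14);
        return _mm_shuffle_epi8(v, m);
    }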