From 982ba6fa712d44275c2541b6b9badf489cf9eda6 Mon Sep 17 00:00:00 2001 From: weidai Date: Mon, 24 Sep 2007 00:43:57 +0000 Subject: - port x64 assembly code to MASM - improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@396 57ff6487-cd31-0410-9ec3-f628ee90f5f0 --- x64masm.asm | 1842 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1840 insertions(+), 2 deletions(-) (limited to 'x64masm.asm') diff --git a/x64masm.asm b/x64masm.asm index 76676a7..a395c9a 100755 --- a/x64masm.asm +++ b/x64masm.asm @@ -1,5 +1,6 @@ -PUBLIC Baseline_Add -PUBLIC Baseline_Sub +include ksamd64.inc +EXTERNDEF s_sosemanukMulTables:FAR + .CODE ALIGN 8 Baseline_Add PROC @@ -54,5 +55,1842 @@ $1@Baseline_Sub: ret Baseline_Sub ENDP +ALIGN 8 +Salsa20_OperateKeystream PROC FRAME +mov r10, [rsp + 5*8] +alloc_stack(10*16 + 32*16 + 8) +save_xmm128 xmm6, 0200h +save_xmm128 xmm7, 0210h +save_xmm128 xmm8, 0220h +save_xmm128 xmm9, 0230h +save_xmm128 xmm10, 0240h +save_xmm128 xmm11, 0250h +save_xmm128 xmm12, 0260h +save_xmm128 xmm13, 0270h +save_xmm128 xmm14, 0280h +save_xmm128 xmm15, 0290h +.endprolog +cmp r8, 4 +jl label5 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +pshufd xmm4, xmm0, 0*64+0*16+0*4+0 +movdqa [rsp + (0*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm0, 1*64+1*16+1*4+1 +movdqa [rsp + (0*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm0, 2*64+2*16+2*4+2 +movdqa [rsp + (0*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm0, 3*64+3*16+3*4+3 +movdqa [rsp + (0*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm1, 0*64+0*16+0*4+0 +movdqa [rsp + (1*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm1, 2*64+2*16+2*4+2 +movdqa [rsp + (1*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm1, 3*64+3*16+3*4+3 +movdqa [rsp + (1*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm2, 1*64+1*16+1*4+1 +movdqa [rsp + (2*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm2, 2*64+2*16+2*4+2 +movdqa [rsp + (2*4+2)*16 + 
256], xmm4 +pshufd xmm4, xmm2, 3*64+3*16+3*4+3 +movdqa [rsp + (2*4+3)*16 + 256], xmm4 +pshufd xmm4, xmm3, 0*64+0*16+0*4+0 +movdqa [rsp + (3*4+0)*16 + 256], xmm4 +pshufd xmm4, xmm3, 1*64+1*16+1*4+1 +movdqa [rsp + (3*4+1)*16 + 256], xmm4 +pshufd xmm4, xmm3, 2*64+2*16+2*4+2 +movdqa [rsp + (3*4+2)*16 + 256], xmm4 +pshufd xmm4, xmm3, 3*64+3*16+3*4+3 +movdqa [rsp + (3*4+3)*16 + 256], xmm4 +label1: +mov eax, dword ptr [r10 + 8*4] +mov r11d, dword ptr [r10 + 5*4] +mov dword ptr [rsp + 8*16 + 0*4 + 256], eax +mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 1*4 + 256], eax +mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 2*4 + 256], eax +mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [rsp + 8*16 + 3*4 + 256], eax +mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d +add eax, 1 +adc r11d, 0 +mov dword ptr [r10 + 8*4], eax +mov dword ptr [r10 + 5*4], r11d +movdqa xmm0, [rsp + 12*16 + 1*256] +movdqa xmm4, [rsp + 13*16 + 1*256] +movdqa xmm8, [rsp + 14*16 + 1*256] +movdqa xmm12, [rsp + 15*16 + 1*256] +movdqa xmm2, [rsp + 0*16 + 1*256] +movdqa xmm6, [rsp + 1*16 + 1*256] +movdqa xmm10, [rsp + 2*16 + 1*256] +movdqa xmm14, [rsp + 3*16 + 1*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 1*256] +pxor xmm4, [rsp + 5*16 + 1*256] +pxor xmm8, [rsp + 6*16 + 1*256] +pxor xmm12, [rsp + 7*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd 
xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 1*256] +pxor xmm4, [rsp + 9*16 + 1*256] +pxor xmm8, [rsp + 10*16 + 1*256] +pxor xmm12, [rsp + 11*16 + 1*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 1*256] +pxor xmm4, [rsp + 13*16 + 1*256] +pxor xmm8, [rsp + 14*16 + 1*256] +pxor xmm12, [rsp + 15*16 + 1*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +mov rax, r9 +jmp label2 +labelSSE2_Salsa_Output: +movdqa xmm0, xmm4 +punpckldq xmm4, xmm5 +movdqa xmm1, xmm6 +punpckldq xmm6, xmm7 +movdqa xmm2, xmm4 +punpcklqdq xmm4, xmm6 +punpckhqdq xmm2, 
xmm6 +punpckhdq xmm0, xmm5 +punpckhdq xmm1, xmm7 +movdqa xmm6, xmm0 +punpcklqdq xmm0, xmm1 +punpckhqdq xmm6, xmm1 +test rdx, rdx +jz labelSSE2_Salsa_Output_A3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_A7 +pxor xmm4, [rdx+0*16] +pxor xmm2, [rdx+4*16] +pxor xmm0, [rdx+8*16] +pxor xmm6, [rdx+12*16] +add rdx, 1*16 +jmp labelSSE2_Salsa_Output_A3 +labelSSE2_Salsa_Output_A7: +movdqu xmm1, [rdx+0*16] +pxor xmm4, xmm1 +movdqu xmm1, [rdx+4*16] +pxor xmm2, xmm1 +movdqu xmm1, [rdx+8*16] +pxor xmm0, xmm1 +movdqu xmm1, [rdx+12*16] +pxor xmm6, xmm1 +add rdx, 1*16 +labelSSE2_Salsa_Output_A3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_A8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+4*16], xmm2 +movdqa [rcx+8*16], xmm0 +movdqa [rcx+12*16], xmm6 +jmp labelSSE2_Salsa_Output_A9 +labelSSE2_Salsa_Output_A8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+4*16], xmm2 +movdqu [rcx+8*16], xmm0 +movdqu [rcx+12*16], xmm6 +labelSSE2_Salsa_Output_A9: +add rcx, 1*16 +ret +label6: +movdqa xmm0, [rsp + 12*16 + 0*256] +movdqa xmm4, [rsp + 13*16 + 0*256] +movdqa xmm8, [rsp + 14*16 + 0*256] +movdqa xmm12, [rsp + 15*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 4*16 + 0*256] +pxor xmm4, [rsp + 5*16 + 0*256] +pxor xmm8, [rsp + 6*16 + 0*256] +pxor xmm12, [rsp + 7*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 4*16], xmm0 +movdqa [rsp + 5*16], xmm4 +movdqa [rsp + 6*16], xmm8 +movdqa [rsp + 7*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, 
xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 8*16 + 0*256] +pxor xmm4, [rsp + 9*16 + 0*256] +pxor xmm8, [rsp + 10*16 + 0*256] +pxor xmm12, [rsp + 11*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 8*16], xmm0 +movdqa [rsp + 9*16], xmm4 +movdqa [rsp + 10*16], xmm8 +movdqa [rsp + 11*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 12*16 + 0*256] +pxor xmm4, [rsp + 13*16 + 0*256] +pxor xmm8, [rsp + 14*16 + 0*256] +pxor xmm12, [rsp + 15*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 12*16], xmm0 +movdqa [rsp + 13*16], xmm4 +movdqa [rsp + 14*16], xmm8 +movdqa [rsp + 15*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +label2: +movdqa xmm0, [rsp + 7*16 + 0*256] +movdqa xmm4, [rsp + 4*16 + 0*256] +movdqa xmm8, [rsp + 5*16 + 0*256] +movdqa xmm12, [rsp + 6*16 + 0*256] +movdqa xmm2, [rsp + 0*16 + 0*256] +movdqa xmm6, [rsp + 1*16 + 0*256] +movdqa xmm10, [rsp + 
2*16 + 0*256] +movdqa xmm14, [rsp + 3*16 + 0*256] +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 7 +pslld xmm4, 7 +pslld xmm8, 7 +pslld xmm12, 7 +psrld xmm1, 32-7 +psrld xmm5, 32-7 +psrld xmm9, 32-7 +psrld xmm13, 32-7 +pxor xmm0, [rsp + 13*16 + 0*256] +pxor xmm4, [rsp + 14*16 + 0*256] +pxor xmm8, [rsp + 15*16 + 0*256] +pxor xmm12, [rsp + 12*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 13*16], xmm0 +movdqa [rsp + 14*16], xmm4 +movdqa [rsp + 15*16], xmm8 +movdqa [rsp + 12*16], xmm12 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +paddd xmm0, xmm2 +paddd xmm4, xmm6 +paddd xmm8, xmm10 +paddd xmm12, xmm14 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 9 +pslld xmm4, 9 +pslld xmm8, 9 +pslld xmm12, 9 +psrld xmm3, 32-9 +psrld xmm7, 32-9 +psrld xmm11, 32-9 +psrld xmm15, 32-9 +pxor xmm0, [rsp + 10*16 + 0*256] +pxor xmm4, [rsp + 11*16 + 0*256] +pxor xmm8, [rsp + 8*16 + 0*256] +pxor xmm12, [rsp + 9*16 + 0*256] +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 10*16], xmm0 +movdqa [rsp + 11*16], xmm4 +movdqa [rsp + 8*16], xmm8 +movdqa [rsp + 9*16], xmm12 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +paddd xmm0, xmm1 +paddd xmm4, xmm5 +paddd xmm8, xmm9 +paddd xmm12, xmm13 +movdqa xmm1, xmm0 +movdqa xmm5, xmm4 +movdqa xmm9, xmm8 +movdqa xmm13, xmm12 +pslld xmm0, 13 +pslld xmm4, 13 +pslld xmm8, 13 +pslld xmm12, 13 +psrld xmm1, 32-13 +psrld xmm5, 32-13 +psrld xmm9, 32-13 +psrld xmm13, 32-13 +pxor xmm0, [rsp + 7*16 + 0*256] +pxor xmm4, [rsp + 4*16 + 0*256] +pxor xmm8, [rsp + 5*16 + 0*256] +pxor xmm12, [rsp + 6*16 + 0*256] +pxor xmm0, xmm1 +pxor xmm4, xmm5 +pxor xmm8, xmm9 +pxor xmm12, xmm13 +movdqa [rsp + 7*16], xmm0 +movdqa [rsp + 4*16], xmm4 +movdqa [rsp + 5*16], xmm8 
+movdqa [rsp + 6*16], xmm12 +paddd xmm0, xmm3 +paddd xmm4, xmm7 +paddd xmm8, xmm11 +paddd xmm12, xmm15 +movdqa xmm3, xmm0 +movdqa xmm7, xmm4 +movdqa xmm11, xmm8 +movdqa xmm15, xmm12 +pslld xmm0, 18 +pslld xmm4, 18 +pslld xmm8, 18 +pslld xmm12, 18 +psrld xmm3, 32-18 +psrld xmm7, 32-18 +psrld xmm11, 32-18 +psrld xmm15, 32-18 +pxor xmm0, xmm2 +pxor xmm4, xmm6 +pxor xmm8, xmm10 +pxor xmm12, xmm14 +pxor xmm0, xmm3 +pxor xmm4, xmm7 +pxor xmm8, xmm11 +pxor xmm12, xmm15 +movdqa [rsp + 0*16], xmm0 +movdqa [rsp + 1*16], xmm4 +movdqa [rsp + 2*16], xmm8 +movdqa [rsp + 3*16], xmm12 +sub eax, 2 +jnz label6 +movdqa xmm4, [rsp + 0*16 + 256] +paddd xmm4, [rsp + 0*16] +movdqa xmm5, [rsp + 13*16 + 256] +paddd xmm5, [rsp + 13*16] +movdqa xmm6, [rsp + 10*16 + 256] +paddd xmm6, [rsp + 10*16] +movdqa xmm7, [rsp + 7*16 + 256] +paddd xmm7, [rsp + 7*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 4*16 + 256] +paddd xmm4, [rsp + 4*16] +movdqa xmm5, [rsp + 1*16 + 256] +paddd xmm5, [rsp + 1*16] +movdqa xmm6, [rsp + 14*16 + 256] +paddd xmm6, [rsp + 14*16] +movdqa xmm7, [rsp + 11*16 + 256] +paddd xmm7, [rsp + 11*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 8*16 + 256] +paddd xmm4, [rsp + 8*16] +movdqa xmm5, [rsp + 5*16 + 256] +paddd xmm5, [rsp + 5*16] +movdqa xmm6, [rsp + 2*16 + 256] +paddd xmm6, [rsp + 2*16] +movdqa xmm7, [rsp + 15*16 + 256] +paddd xmm7, [rsp + 15*16] +call labelSSE2_Salsa_Output +movdqa xmm4, [rsp + 12*16 + 256] +paddd xmm4, [rsp + 12*16] +movdqa xmm5, [rsp + 9*16 + 256] +paddd xmm5, [rsp + 9*16] +movdqa xmm6, [rsp + 6*16 + 256] +paddd xmm6, [rsp + 6*16] +movdqa xmm7, [rsp + 3*16 + 256] +paddd xmm7, [rsp + 3*16] +call labelSSE2_Salsa_Output +test rdx, rdx +jz label9 +add rdx, 12*16 +label9: +add rcx, 12*16 +sub r8, 4 +cmp r8, 4 +jge label1 +label5: +sub r8, 1 +jl label4 +movdqa xmm0, [r10 + 0*16] +movdqa xmm1, [r10 + 1*16] +movdqa xmm2, [r10 + 2*16] +movdqa xmm3, [r10 + 3*16] +mov rax, r9 +label0: +movdqa xmm4, xmm3 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld 
xmm4, 7 +psrld xmm5, 32-7 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm1 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 2*64+1*16+0*4+3 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 0*64+3*16+2*4+1 +movdqa xmm4, xmm1 +paddd xmm4, xmm0 +movdqa xmm5, xmm4 +pslld xmm4, 7 +psrld xmm5, 32-7 +pxor xmm3, xmm4 +pxor xmm3, xmm5 +movdqa xmm4, xmm0 +paddd xmm4, xmm3 +movdqa xmm5, xmm4 +pslld xmm4, 9 +psrld xmm5, 32-9 +pxor xmm2, xmm4 +pxor xmm2, xmm5 +movdqa xmm4, xmm3 +paddd xmm4, xmm2 +movdqa xmm5, xmm4 +pslld xmm4, 13 +psrld xmm5, 32-13 +pxor xmm1, xmm4 +pxor xmm1, xmm5 +movdqa xmm4, xmm2 +paddd xmm4, xmm1 +movdqa xmm5, xmm4 +pslld xmm4, 18 +psrld xmm5, 32-18 +pxor xmm0, xmm4 +pxor xmm0, xmm5 +pshufd xmm1, xmm1, 0*64+3*16+2*4+1 +pshufd xmm2, xmm2, 1*64+0*16+3*4+2 +pshufd xmm3, xmm3, 2*64+1*16+0*4+3 +sub eax, 2 +jnz label0 +paddd xmm0, [r10 + 0*16] +paddd xmm1, [r10 + 1*16] +paddd xmm2, [r10 + 2*16] +paddd xmm3, [r10 + 3*16] +add dword ptr [r10 + 8*4], 1 +adc dword ptr [r10 + 5*4], 0 +pcmpeqb xmm6, xmm6 +psrlq xmm6, 32 +pshufd xmm7, xmm6, 0*64+1*16+2*4+3 +movdqa xmm4, xmm0 +movdqa xmm5, xmm3 +pand xmm0, xmm7 +pand xmm4, xmm6 +pand xmm3, xmm6 +pand xmm5, xmm7 +por xmm4, xmm5 +movdqa xmm5, xmm1 +pand xmm1, xmm7 +pand xmm5, xmm6 +por xmm0, xmm5 +pand xmm6, xmm2 +pand xmm2, xmm7 +por xmm1, xmm6 +por xmm2, xmm3 +movdqa xmm5, xmm4 +movdqa xmm6, xmm0 +shufpd xmm4, xmm1, 2 +shufpd xmm0, xmm2, 2 +shufpd xmm1, xmm5, 2 +shufpd xmm2, xmm6, 2 +test rdx, rdx +jz labelSSE2_Salsa_Output_B3 +test rdx, 15 +jnz labelSSE2_Salsa_Output_B7 +pxor xmm4, [rdx+0*16] +pxor xmm0, [rdx+1*16] +pxor xmm1, [rdx+2*16] +pxor xmm2, [rdx+3*16] +add rdx, 4*16 +jmp 
labelSSE2_Salsa_Output_B3 +labelSSE2_Salsa_Output_B7: +movdqu xmm3, [rdx+0*16] +pxor xmm4, xmm3 +movdqu xmm3, [rdx+1*16] +pxor xmm0, xmm3 +movdqu xmm3, [rdx+2*16] +pxor xmm1, xmm3 +movdqu xmm3, [rdx+3*16] +pxor xmm2, xmm3 +add rdx, 4*16 +labelSSE2_Salsa_Output_B3: +test rcx, 15 +jnz labelSSE2_Salsa_Output_B8 +movdqa [rcx+0*16], xmm4 +movdqa [rcx+1*16], xmm0 +movdqa [rcx+2*16], xmm1 +movdqa [rcx+3*16], xmm2 +jmp labelSSE2_Salsa_Output_B9 +labelSSE2_Salsa_Output_B8: +movdqu [rcx+0*16], xmm4 +movdqu [rcx+1*16], xmm0 +movdqu [rcx+2*16], xmm1 +movdqu [rcx+3*16], xmm2 +labelSSE2_Salsa_Output_B9: +add rcx, 4*16 +jmp label5 +label4: +movdqa xmm6, [rsp + 0200h] +movdqa xmm7, [rsp + 0210h] +movdqa xmm8, [rsp + 0220h] +movdqa xmm9, [rsp + 0230h] +movdqa xmm10, [rsp + 0240h] +movdqa xmm11, [rsp + 0250h] +movdqa xmm12, [rsp + 0260h] +movdqa xmm13, [rsp + 0270h] +movdqa xmm14, [rsp + 0280h] +movdqa xmm15, [rsp + 0290h] +add rsp, 10*16 + 32*16 + 8 +ret +Salsa20_OperateKeystream ENDP +ALIGN 8 +Rijndael_Enc_ProcessAndXorBlock PROC FRAME +rex_push_reg rbx +push_reg rsi +push_reg rdi +push_reg r12 +push_reg r13 +push_reg r14 +push_reg r15 +.endprolog +mov r11, rcx +mov rdi, [rsp + 5*8 + 7*8] ; inBlock +mov eax, [r8+0*4] +xor eax, [rdi+0*4] +mov r13d, eax +mov ebx, [r8+1*4] +xor ebx, [rdi+1*4] +mov r14d, ebx +and ebx, eax +mov eax, [r8+2*4] +xor eax, [rdi+2*4] +mov r15d, eax +and ebx, eax +mov ecx, [r8+3*4] +xor ecx, [rdi+3*4] +and ebx, ecx +and ebx, 0 +mov edi, ebx +label2: +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +and ebx, [r11+rdi] +add edi, edx +cmp edi, 1024 +jl label2 +and ebx, [r11+1020] +xor r13d, ebx +xor r14d, ebx +xor r15d, ebx +xor ecx, ebx +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +add r8, 8*4 +movzx esi, cl +xor edx, [r11+0*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, [r11+2*1024+4*rsi] +movzx esi, ch +xor edi, [r11+3*1024+4*rsi] 
+mov ecx, r15d +movzx esi, cl +xor ebx, [r11+0*1024+4*rsi] +movzx esi, ch +xor eax, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+2*1024+4*rsi] +movzx esi, ch +xor edx, [r11+3*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor eax, [r11+0*1024+4*rsi] +movzx esi, ch +xor edi, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+2*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+3*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor edi, [r11+0*1024+4*rsi] +movzx esi, ch +xor edx, [r11+1*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+2*1024+4*rsi] +movzx esi, ch +xor eax, [r11+3*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +label0: +mov edi, [r8+0*4] +mov eax, [r8+1*4] +mov ebx, [r8+2*4] +mov ecx, [r8+3*4] +movzx esi, dl +xor edi, [r11+3*1024+4*rsi] +movzx esi, dh +xor eax, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+0*1024+4*rsi] +mov edx, r15d +movzx esi, dl +xor ecx, [r11+3*1024+4*rsi] +movzx esi, dh +xor edi, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor eax, [r11+1*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+0*1024+4*rsi] +mov edx, r14d +movzx esi, dl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, dh +xor ecx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor edi, [r11+1*1024+4*rsi] +movzx esi, dh +xor eax, [r11+0*1024+4*rsi] +mov edx, r13d +movzx esi, dl +xor eax, [r11+3*1024+4*rsi] +movzx esi, dh +xor ebx, [r11+2*1024+4*rsi] +shr edx, 16 +movzx esi, dl +xor ecx, [r11+1*1024+4*rsi] +movzx esi, dh +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +mov edi, [r8+4*4] +mov eax, [r8+5*4] +mov ebx, [r8+6*4] +mov edx, [r8+7*4] +movzx esi, cl +xor edi, [r11+3*1024+4*rsi] +movzx esi, ch +xor eax, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor ebx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edx, [r11+0*1024+4*rsi] +mov ecx, r15d +movzx esi, cl +xor edx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edi, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor eax, 
[r11+1*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+0*1024+4*rsi] +mov ecx, r14d +movzx esi, cl +xor ebx, [r11+3*1024+4*rsi] +movzx esi, ch +xor edx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edi, [r11+1*1024+4*rsi] +movzx esi, ch +xor eax, [r11+0*1024+4*rsi] +mov ecx, r13d +movzx esi, cl +xor eax, [r11+3*1024+4*rsi] +movzx esi, ch +xor ebx, [r11+2*1024+4*rsi] +shr ecx, 16 +movzx esi, cl +xor edx, [r11+1*1024+4*rsi] +movzx esi, ch +xor edi, [r11+0*1024+4*rsi] +mov r15d, ebx +mov r14d, eax +mov r13d, edi +add r8, 8*4 +cmp r9, r8 +jne label0 +mov eax, [r9+0*4] +mov ecx, [r9+1*4] +mov esi, [r9+2*4] +mov edi, [r9+3*4] +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor eax, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor ecx, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor esi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor edi, ebx +mov edx, r15d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor edi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor eax, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor ecx, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor esi, ebx +mov edx, r14d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor esi, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor edi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor eax, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor ecx, ebx +mov edx, r13d +movzx ebx, dl +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 3*8 +xor ecx, ebx +movzx ebx, dh +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 2*8 +xor esi, ebx +shr edx, 16 +movzx ebx, dl +shr edx, 8 +movzx ebx, BYTE PTR [r11+1+4*rbx] +shl ebx, 1*8 +xor edi, ebx +movzx ebx, BYTE PTR [r11+1+4*rdx] +xor eax, ebx +mov rbx, [rsp + 6*8 + 7*8] ; xorBlock +test rbx, rbx +jz label1 +xor eax, [rbx+0*4] +xor 
ecx, [rbx+1*4] +xor esi, [rbx+2*4] +xor edi, [rbx+3*4] +label1: +mov rbx, [rsp + 7*8 + 7*8] ; outBlock +mov [rbx+0*4], eax +mov [rbx+1*4], ecx +mov [rbx+2*4], esi +mov [rbx+3*4], edi +pop r15 +pop r14 +pop r13 +pop r12 +pop rdi +pop rsi +pop rbx +ret +Rijndael_Enc_ProcessAndXorBlock ENDP + +ALIGN 8 +Sosemanuk_OperateKeystream PROC FRAME +rex_push_reg rsi +push_reg rdi +alloc_stack(80*4*2+12*4+8*8 + 2*16+8) +save_xmm128 xmm6, 02f0h +save_xmm128 xmm7, 0300h +.endprolog +mov rdi, r8 +mov rax, r9 +mov QWORD PTR [rsp+1*8], rdi +mov QWORD PTR [rsp+2*8], rdx +mov QWORD PTR [rsp+6*8], rax +lea rcx, [4*rcx+rcx] +lea rsi, [4*rcx] +mov QWORD PTR [rsp+3*8], rsi +movdqa xmm0, [rax+0*16] +movdqa [rsp + 8*8+0*16], xmm0 +movdqa xmm0, [rax+1*16] +movdqa [rsp + 8*8+1*16], xmm0 +movq xmm0, QWORD PTR [rax+2*16] +movq QWORD PTR [rsp + 8*8+2*16], xmm0 +psrlq xmm0, 32 +movd r10d, xmm0 +mov ecx, [rax+10*4] +mov edx, [rax+11*4] +pcmpeqb xmm7, xmm7 +label2: +lea rdi, [rsp + 8*8 + 12*4] +mov rax, 80 +cmp rsi, 80 +cmovg rsi, rax +mov QWORD PTR [rsp+7*8], rsi +lea rsi, [rdi+rsi] +mov QWORD PTR [rsp+4*8], rsi +lea rsi, s_sosemanukMulTables +label0: +mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4] +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4] +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg 
r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4] +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4] +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4] +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4] +add 
ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4] +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4] +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4] +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d +mov eax, [rsp + 
8*8 + ((8+0)-((8+0)/(10))*(10))*4] +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4] +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4] +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4] +mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + 
(((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4] +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4] +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4] +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, 
[rsi+rax*4] +mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4] +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4] +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4] +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4] +add edx, r11d +movzx 
r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4] +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + edx] +xor r11d, ecx +mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d +mov r11d, 1 +and r11d, edx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4] +add ecx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul edx, 54655307h +rol edx, 7 +mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d +mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4] +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax +rol eax, 8 +lea r11d, [r10d + ecx] +xor r11d, edx +mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d +mov r11d, 1 +and r11d, ecx +neg r11d +and r11d, r10d +xor r10d, eax +movzx eax, al +xor r10d, [rsi+rax*4] +mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4] +xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4] +add edx, r11d +movzx r11d, al +shr eax, 8 +xor r10d, [rsi+1024+r11*4] +xor r10d, eax +imul ecx, 54655307h +rol ecx, 7 +mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d +add rdi, 5*4 +cmp rdi, QWORD PTR [rsp+4*8] +jne label0 +mov rax, QWORD PTR [rsp+2*8] +mov r11, QWORD PTR [rsp+1*8] +lea rdi, [rsp + 8*8 + 12*4] +mov rsi, QWORD PTR [rsp+7*8] +label1: +movdqa xmm0, [rdi+0*20*4] +movdqa xmm2, [rdi+2*20*4] +movdqa xmm3, [rdi+3*20*4] +movdqa xmm1, [rdi+1*20*4] +movdqa xmm4, xmm0 +pand xmm0, xmm2 +pxor xmm0, xmm3 +pxor xmm2, xmm1 +pxor xmm2, xmm0 +por xmm3, xmm4 +pxor xmm3, xmm1 +pxor xmm4, xmm2 +movdqa xmm1, xmm3 +por xmm3, xmm4 +pxor xmm3, xmm0 +pand xmm0, xmm1 +pxor xmm4, xmm0 +pxor xmm1, xmm3 +pxor xmm1, xmm4 +pxor xmm4, xmm7 +pxor xmm2, [rdi+80*4] +pxor xmm3, 
[rdi+80*5] +pxor xmm1, [rdi+80*6] +pxor xmm4, [rdi+80*7] +cmp rsi, 16 +jl label4 +movdqa xmm6, xmm2 +punpckldq xmm2, xmm3 +movdqa xmm5, xmm1 +punpckldq xmm1, xmm4 +movdqa xmm0, xmm2 +punpcklqdq xmm2, xmm1 +punpckhqdq xmm0, xmm1 +punpckhdq xmm6, xmm3 +punpckhdq xmm5, xmm4 +movdqa xmm3, xmm6 +punpcklqdq xmm6, xmm5 +punpckhqdq xmm3, xmm5 +test rax, rax +jz labelSSE2_Sosemanuk_Output3 +test rax, 15 +jnz labelSSE2_Sosemanuk_Output7 +pxor xmm2, [rax+0*16] +pxor xmm0, [rax+1*16] +pxor xmm6, [rax+2*16] +pxor xmm3, [rax+3*16] +add rax, 4*16 +jmp labelSSE2_Sosemanuk_Output3 +labelSSE2_Sosemanuk_Output7: +movdqu xmm1, [rax+0*16] +pxor xmm2, xmm1 +movdqu xmm1, [rax+1*16] +pxor xmm0, xmm1 +movdqu xmm1, [rax+2*16] +pxor xmm6, xmm1 +movdqu xmm1, [rax+3*16] +pxor xmm3, xmm1 +add rax, 4*16 +labelSSE2_Sosemanuk_Output3: +test r11, 15 +jnz labelSSE2_Sosemanuk_Output8 +movdqa [r11+0*16], xmm2 +movdqa [r11+1*16], xmm0 +movdqa [r11+2*16], xmm6 +movdqa [r11+3*16], xmm3 +jmp labelSSE2_Sosemanuk_Output9 +labelSSE2_Sosemanuk_Output8: +movdqu [r11+0*16], xmm2 +movdqu [r11+1*16], xmm0 +movdqu [r11+2*16], xmm6 +movdqu [r11+3*16], xmm3 +labelSSE2_Sosemanuk_Output9: +add r11, 4*16 +add rdi, 4*4 +sub rsi, 16 +jnz label1 +mov rsi, QWORD PTR [rsp+3*8] +sub rsi, 80 +jz label6 +mov QWORD PTR [rsp+3*8], rsi +mov QWORD PTR [rsp+2*8], rax +mov QWORD PTR [rsp+1*8], r11 +jmp label2 +label4: +test rax, rax +jz label5 +movd xmm0, dword ptr [rax+0*4] +pxor xmm2, xmm0 +movd xmm0, dword ptr [rax+1*4] +pxor xmm3, xmm0 +movd xmm0, dword ptr [rax+2*4] +pxor xmm1, xmm0 +movd xmm0, dword ptr [rax+3*4] +pxor xmm4, xmm0 +add rax, 16 +label5: +movd dword ptr [r11+0*4], xmm2 +movd dword ptr [r11+1*4], xmm3 +movd dword ptr [r11+2*4], xmm1 +movd dword ptr [r11+3*4], xmm4 +sub rsi, 4 +jz label6 +add r11, 16 +psrldq xmm2, 4 +psrldq xmm3, 4 +psrldq xmm1, 4 +psrldq xmm4, 4 +jmp label4 +label6: +mov r10, QWORD PTR [rsp+6*8] +movdqa xmm0, [rsp + 8*8+0*16] +movdqa [r10+0*16], xmm0 +movdqa xmm0, [rsp + 8*8+1*16] +movdqa 
[r10+1*16], xmm0
; ---- tail of Sosemanuk_OperateKeystream (PROC header is earlier in the file) ----
movq xmm0, QWORD PTR [rsp + 8*8+2*16]
movq QWORD PTR [r10+2*16], xmm0
mov [r10+10*4], ecx                     ; write back the two scalar LFSR words
mov [r10+11*4], edx
movdqa xmm6, [rsp + 02f0h]              ; restore non-volatile XMM regs (MS x64 ABI)
movdqa xmm7, [rsp + 0300h]
add rsp, 80*4*2+12*4+8*8 + 2*16+8       ; release the frame sized in the prologue
pop rdi
pop rsi
ret
Sosemanuk_OperateKeystream ENDP

;-----------------------------------------------------------------------
; Panama_SSE2_Pull -- SSE2 implementation of the Panama "pull" operation
;   (state update with optional keystream output / input XOR).
; ABI:   Microsoft x64 (integer args in rcx, rdx, r8, r9).
; In:    rcx = iteration count; scaled to a byte count by "shl rcx, 5"
;              (32 bytes per iteration); zero jumps straight to label5.
;        rdx = Panama state: four 16-byte words at [rdx+0*16..3*16],
;              a 17th 32-bit word at [rdx+4*16], a position counter at
;              [rdx+4*17], and a circular buffer of 32-byte entries
;              starting at [rdx+20*4] (offsets wrapped with "and, 31*32",
;              i.e. 32 entries -- presumably Panama's LAMBDA buffer;
;              verify against the C++ side).
;        r8  = output pointer; may be NULL (tested before any store).
;        r9  = input pointer XORed into the output; may be NULL.
; Saves/restores xmm6, xmm7 (non-volatile under the MS x64 ABI).
; NOTE(review): rdi is also non-volatile in the MS x64 ABI but is
;   overwritten below ("mov rdi, rcx") with no save/restore and no
;   push_reg unwind record -- confirm callers tolerate this, or preserve
;   rdi in the prologue.
;-----------------------------------------------------------------------
Panama_SSE2_Pull PROC FRAME
alloc_stack(2*16+8)                     ; 2 XMM save slots + 8 to keep rsp 16-aligned
save_xmm128 xmm6, 0h
save_xmm128 xmm7, 10h
.endprolog
shl rcx, 5                              ; iteration count -> byte count (32 B/iter)
jz label5                               ; nothing to do: restore and return
mov r10d, [rdx+4*17]                    ; r10 = buffer position counter from state
add rcx, r10
mov rdi, rcx                            ; rdi = terminal value for the r10 counter
movdqa xmm0, xmmword ptr [rdx+0*16]     ; load the 4x4 32-bit state words
movdqa xmm1, xmmword ptr [rdx+1*16]
movdqa xmm2, xmmword ptr [rdx+2*16]
movdqa xmm3, xmmword ptr [rdx+3*16]
mov eax, dword ptr [rdx+4*16]           ; 17th state word kept in eax
label4:                                 ; ---- main loop: one update + 32 output bytes
; build shifted-by-one-dword copies of neighbouring state registers
movdqa xmm6, xmm2
movss xmm6, xmm3
pshufd xmm5, xmm6, 0*64+3*16+2*4+1
movd xmm6, eax
movdqa xmm7, xmm3
movss xmm7, xmm6
pshufd xmm6, xmm7, 0*64+3*16+2*4+1
; scalar word: eax ^= NOT(low dword of xmm2) OR (low dword of xmm3)
movd ecx, xmm2
not ecx
movd r11d, xmm3
or ecx, r11d
xor eax, ecx
; For each word index i below: t = NOT(x) OR y, XOR z (pcmpeqb makes the
; all-ones mask for NOT); then rotate by j*(j+1)/2 with j = (5*i) MOD 17
; and scatter to word ((j*13+16) MOD 17) -- the MOD expressions are
; evaluated by the assembler at build time (Panama pi-style indexing).
pcmpeqb xmm7, xmm7                      ; words i = 1, 5, 9, 13 from (xmm1,xmm2,xmm3)
pxor xmm7, xmm1
por xmm7, xmm2
pxor xmm7, xmm3
movd ecx, xmm7
rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx
punpckhqdq xmm7, xmm7
movd ecx, xmm7
rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx
pcmpeqb xmm7, xmm7                      ; words i = 2, 6, 10, 14 from (xmm0,xmm1,xmm2)
pxor xmm7, xmm0
por xmm7, xmm1
pxor xmm7, xmm2
movd ecx, xmm7
rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx
punpckhqdq xmm7, xmm7
movd ecx, xmm7
rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx
pcmpeqb xmm7, xmm7                      ; words i = 3, 7, 11, 15 from (xmm6,xmm0,xmm1)
pxor xmm7, xmm6
por xmm7, xmm0
pxor xmm7, xmm1
movd ecx, xmm7
rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx
punpckhqdq xmm7, xmm7
movd ecx, xmm7
rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx
pcmpeqb xmm7, xmm7                      ; words i = 4, 8, 12, 16 from (xmm5,xmm6,xmm0)
pxor xmm7, xmm5
por xmm7, xmm6
pxor xmm7, xmm0
movd ecx, xmm7
rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx
punpckhqdq xmm7, xmm7
movd ecx, xmm7
rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
movd ecx, xmm7
rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))
mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx
; interleave word pairs into 64-bit lanes for the output block
movdqa xmm4, xmm3
punpcklqdq xmm3, xmm2
punpckhdq xmm4, xmm2
movdqa xmm2, xmm1
punpcklqdq xmm1, xmm0
punpckhdq xmm2, xmm0
test r8, r8
jz label0                               ; no output buffer: skip store/XOR entirely
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm2
punpckhqdq xmm6, xmm2
test r9, 15
jnz label2                              ; unaligned input: movdqu path
test r9, r9
jz label1                               ; NULL input: nothing to XOR in
pxor xmm4, [r9]                         ; aligned input: XOR 32 bytes in place
pxor xmm6, [r9+16]
add r9, 32
jmp label1
label2:
movdqu xmm0, [r9]
movdqu xmm2, [r9+16]
pxor xmm4, xmm0
pxor xmm6, xmm2
add r9, 32
label1:
test r8, 15
jnz label3                              ; unaligned output: movdqu stores
movdqa xmmword ptr [r8], xmm4
movdqa xmmword ptr [r8+16], xmm6
add r8, 32
jmp label0
label3:
movdqu xmmword ptr [r8], xmm4
movdqu xmmword ptr [r8+16], xmm6
add r8, 32
label0:
; circular-buffer update: rcx/r11 are byte offsets of two 32-byte entries,
; wrapped to the 32-entry buffer with "and, 31*32"
lea rcx, [r10 + 32]
and rcx, 31*32
lea r11, [r10 + (32-24)*32]
and r11, 31*32
movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8]
pxor xmm3, xmm0
pshufd xmm0, xmm0, 2*64+3*16+0*4+1
movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3
pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8]
movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8]
pxor xmm1, xmm4
movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1
pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8]
movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4
; reload the state words just rewritten by the scatter above and apply a
; theta-style diffusion: each register XORed with dword-shifted copies of
; its neighbours built via movss+pshufd
movdqa xmm3, xmmword ptr [rdx+3*16]
movdqa xmm2, xmmword ptr [rdx+2*16]
movdqa xmm1, xmmword ptr [rdx+1*16]
movdqa xmm0, xmmword ptr [rdx+0*16]
movd xmm6, eax
movdqa xmm7, xmm3
movss xmm7, xmm6
movdqa xmm6, xmm2
movss xmm6, xmm3
movdqa xmm5, xmm1
movss xmm5, xmm2
movdqa xmm4, xmm0
movss xmm4, xmm1
pshufd xmm7, xmm7, 0*64+3*16+2*4+1
pshufd xmm6, xmm6, 0*64+3*16+2*4+1
pshufd xmm5, xmm5, 0*64+3*16+2*4+1
pshufd xmm4, xmm4, 0*64+3*16+2*4+1
xor eax, 1                              ; presumably Panama's constant injection on word 0 -- verify
movd ecx, xmm0
xor eax, ecx
movd ecx, xmm3
xor eax, ecx
pxor xmm3, xmm2
pxor xmm2, xmm1
pxor xmm1, xmm0
pxor xmm0, xmm7
pxor xmm3, xmm7
pxor xmm2, xmm6
pxor xmm1, xmm5
pxor xmm0, xmm4
; fold two more buffer taps (offsets -4 and +16 entries) back into the state
lea rcx, [r10 + (32-4)*32]
and rcx, 31*32
lea r11, [r10 + 16*32]
and r11, 31*32
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16]
movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16]
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm5
punpckhqdq xmm6, xmm5
pxor xmm3, xmm4
pxor xmm2, xmm6
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16]
movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16]
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm5
punpckhqdq xmm6, xmm5
pxor xmm1, xmm4
pxor xmm0, xmm6
add r10, 32                             ; advance buffer position; loop until rdi
cmp r10, rdi
jne label4
; write the updated state back (NOTE(review): the counter at [rdx+4*17] is
; not stored here -- presumably maintained by the caller; confirm)
mov [rdx+4*16], eax
movdqa xmmword ptr [rdx+3*16], xmm3
movdqa xmmword ptr [rdx+2*16], xmm2
movdqa xmmword ptr [rdx+1*16], xmm1
movdqa xmmword ptr [rdx+0*16], xmm0
label5:
; epilogue: restore non-volatile XMM registers and release the frame
movdqa xmm6, [rsp + 0h]
movdqa xmm7, [rsp + 10h]
add rsp, 2*16+8
ret
Panama_SSE2_Pull ENDP

_TEXT ENDS
END
-- cgit v1.2.1