From caf9e032e6b4ccb114a74a3936c916bcfaba262d Mon Sep 17 00:00:00 2001
From: weidai
Date: Mon, 2 Mar 2009 02:39:17 +0000
Subject: changes for 5.6:
- added AuthenticatedSymmetricCipher interface class and Filter wrappers
- added CCM, GCM (with SSE2 assembly), CMAC, and SEED
- improved AES speed on x86 and x64
- removed WORD64_AVAILABLE; compiler 64-bit int support is now required

git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@433 57ff6487-cd31-0410-9ec3-f628ee90f5f0
---
 x64masm.asm | 333 ------------------------------------------------------------
 1 file changed, 333 deletions(-)

diff --git a/x64masm.asm b/x64masm.asm
index a395c9a..f27c002 100755
--- a/x64masm.asm
+++ b/x64masm.asm
@@ -1,59 +1,6 @@
 include ksamd64.inc
 EXTERNDEF s_sosemanukMulTables:FAR
-
-	.CODE
-	ALIGN 8
-Baseline_Add PROC
-	lea rdx, [rdx+8*rcx]
-	lea r8, [r8+8*rcx]
-	lea r9, [r9+8*rcx]
-	neg rcx				; rcx is negative index
-	jz $1@Baseline_Add
-	mov rax,[r8+8*rcx]
-	add rax,[r9+8*rcx]
-	mov [rdx+8*rcx],rax
-$0@Baseline_Add:
-	mov rax,[r8+8*rcx+8]
-	adc rax,[r9+8*rcx+8]
-	mov [rdx+8*rcx+8],rax
-	lea rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
-	jrcxz $1@Baseline_Add		; loop until rcx overflows and becomes zero
-	mov rax,[r8+8*rcx]
-	adc rax,[r9+8*rcx]
-	mov [rdx+8*rcx],rax
-	jmp $0@Baseline_Add
-$1@Baseline_Add:
-	mov rax, 0
-	adc rax, rax			; store carry into rax (return result register)
-	ret
-Baseline_Add ENDP
-
-	ALIGN 8
-Baseline_Sub PROC
-	lea rdx, [rdx+8*rcx]
-	lea r8, [r8+8*rcx]
-	lea r9, [r9+8*rcx]
-	neg rcx				; rcx is negative index
-	jz $1@Baseline_Sub
-	mov rax,[r8+8*rcx]
-	sub rax,[r9+8*rcx]
-	mov [rdx+8*rcx],rax
-$0@Baseline_Sub:
-	mov rax,[r8+8*rcx+8]
-	sbb rax,[r9+8*rcx+8]
-	mov [rdx+8*rcx+8],rax
-	lea rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
-	jrcxz $1@Baseline_Sub		; loop until rcx overflows and becomes zero
-	mov rax,[r8+8*rcx]
-	sbb rax,[r9+8*rcx]
-	mov [rdx+8*rcx],rax
-	jmp $0@Baseline_Sub
-$1@Baseline_Sub:
-	mov rax, 0
-	adc rax, rax			; store carry into rax (return result register)
-
-	ret
-Baseline_Sub ENDP
 
 ALIGN 8
 Salsa20_OperateKeystream PROC FRAME
@@ -761,286 +708,6 @@ movdqa xmm15, [rsp + 0290h]
 add rsp, 10*16 + 32*16 + 8
 ret
 Salsa20_OperateKeystream ENDP
-ALIGN 8
-Rijndael_Enc_ProcessAndXorBlock PROC FRAME
-rex_push_reg rbx
-push_reg rsi
-push_reg rdi
-push_reg r12
-push_reg r13
-push_reg r14
-push_reg r15
-.endprolog
-mov r11, rcx
-mov rdi, [rsp + 5*8 + 7*8]	; inBlock
-mov eax, [r8+0*4]
-xor eax, [rdi+0*4]
-mov r13d, eax
-mov ebx, [r8+1*4]
-xor ebx, [rdi+1*4]
-mov r14d, ebx
-and ebx, eax
-mov eax, [r8+2*4]
-xor eax, [rdi+2*4]
-mov r15d, eax
-and ebx, eax
-mov ecx, [r8+3*4]
-xor ecx, [rdi+3*4]
-and ebx, ecx
-and ebx, 0
-mov edi, ebx
-label2:
-and ebx, [r11+rdi]
-add edi, edx
-and ebx, [r11+rdi]
-add edi, edx
-and ebx, [r11+rdi]
-add edi, edx
-and ebx, [r11+rdi]
-add edi, edx
-cmp edi, 1024
-jl label2
-and ebx, [r11+1020]
-xor r13d, ebx
-xor r14d, ebx
-xor r15d, ebx
-xor ecx, ebx
-mov edi, [r8+4*4]
-mov eax, [r8+5*4]
-mov ebx, [r8+6*4]
-mov edx, [r8+7*4]
-add r8, 8*4
-movzx esi, cl
-xor edx, [r11+0*1024+4*rsi]
-movzx esi, ch
-xor ebx, [r11+1*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor eax, [r11+2*1024+4*rsi]
-movzx esi, ch
-xor edi, [r11+3*1024+4*rsi]
-mov ecx, r15d
-movzx esi, cl
-xor ebx, [r11+0*1024+4*rsi]
-movzx esi, ch
-xor eax, [r11+1*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor edi, [r11+2*1024+4*rsi]
-movzx esi, ch
-xor edx, [r11+3*1024+4*rsi]
-mov ecx, r14d
-movzx esi, cl
-xor eax, [r11+0*1024+4*rsi]
-movzx esi, ch
-xor edi, [r11+1*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor edx, [r11+2*1024+4*rsi]
-movzx esi, ch
-xor ebx, [r11+3*1024+4*rsi]
-mov ecx, r13d
-movzx esi, cl
-xor edi, [r11+0*1024+4*rsi]
-movzx esi, ch
-xor edx, [r11+1*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor ebx, [r11+2*1024+4*rsi]
-movzx esi, ch
-xor eax, [r11+3*1024+4*rsi]
-mov r15d, ebx
-mov r14d, eax
-mov r13d, edi
-label0:
-mov edi, [r8+0*4]
-mov eax, [r8+1*4]
-mov ebx, [r8+2*4]
-mov ecx, [r8+3*4]
-movzx esi, dl
-xor edi, [r11+3*1024+4*rsi]
-movzx esi, dh
-xor eax, [r11+2*1024+4*rsi]
-shr edx, 16
-movzx esi, dl
-xor ebx, [r11+1*1024+4*rsi]
-movzx esi, dh
-xor ecx, [r11+0*1024+4*rsi]
-mov edx, r15d
-movzx esi, dl
-xor ecx, [r11+3*1024+4*rsi]
-movzx esi, dh
-xor edi, [r11+2*1024+4*rsi]
-shr edx, 16
-movzx esi, dl
-xor eax, [r11+1*1024+4*rsi]
-movzx esi, dh
-xor ebx, [r11+0*1024+4*rsi]
-mov edx, r14d
-movzx esi, dl
-xor ebx, [r11+3*1024+4*rsi]
-movzx esi, dh
-xor ecx, [r11+2*1024+4*rsi]
-shr edx, 16
-movzx esi, dl
-xor edi, [r11+1*1024+4*rsi]
-movzx esi, dh
-xor eax, [r11+0*1024+4*rsi]
-mov edx, r13d
-movzx esi, dl
-xor eax, [r11+3*1024+4*rsi]
-movzx esi, dh
-xor ebx, [r11+2*1024+4*rsi]
-shr edx, 16
-movzx esi, dl
-xor ecx, [r11+1*1024+4*rsi]
-movzx esi, dh
-xor edi, [r11+0*1024+4*rsi]
-mov r15d, ebx
-mov r14d, eax
-mov r13d, edi
-mov edi, [r8+4*4]
-mov eax, [r8+5*4]
-mov ebx, [r8+6*4]
-mov edx, [r8+7*4]
-movzx esi, cl
-xor edi, [r11+3*1024+4*rsi]
-movzx esi, ch
-xor eax, [r11+2*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor ebx, [r11+1*1024+4*rsi]
-movzx esi, ch
-xor edx, [r11+0*1024+4*rsi]
-mov ecx, r15d
-movzx esi, cl
-xor edx, [r11+3*1024+4*rsi]
-movzx esi, ch
-xor edi, [r11+2*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor eax, [r11+1*1024+4*rsi]
-movzx esi, ch
-xor ebx, [r11+0*1024+4*rsi]
-mov ecx, r14d
-movzx esi, cl
-xor ebx, [r11+3*1024+4*rsi]
-movzx esi, ch
-xor edx, [r11+2*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor edi, [r11+1*1024+4*rsi]
-movzx esi, ch
-xor eax, [r11+0*1024+4*rsi]
-mov ecx, r13d
-movzx esi, cl
-xor eax, [r11+3*1024+4*rsi]
-movzx esi, ch
-xor ebx, [r11+2*1024+4*rsi]
-shr ecx, 16
-movzx esi, cl
-xor edx, [r11+1*1024+4*rsi]
-movzx esi, ch
-xor edi, [r11+0*1024+4*rsi]
-mov r15d, ebx
-mov r14d, eax
-mov r13d, edi
-add r8, 8*4
-cmp r9, r8
-jne label0
-mov eax, [r9+0*4]
-mov ecx, [r9+1*4]
-mov esi, [r9+2*4]
-mov edi, [r9+3*4]
-movzx ebx, dl
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 3*8
-xor eax, ebx
-movzx ebx, dh
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 2*8
-xor ecx, ebx
-shr edx, 16
-movzx ebx, dl
-shr edx, 8
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 1*8
-xor esi, ebx
-movzx ebx, BYTE PTR [r11+1+4*rdx]
-xor edi, ebx
-mov edx, r15d
-movzx ebx, dl
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 3*8
-xor edi, ebx
-movzx ebx, dh
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 2*8
-xor eax, ebx
-shr edx, 16
-movzx ebx, dl
-shr edx, 8
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 1*8
-xor ecx, ebx
-movzx ebx, BYTE PTR [r11+1+4*rdx]
-xor esi, ebx
-mov edx, r14d
-movzx ebx, dl
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 3*8
-xor esi, ebx
-movzx ebx, dh
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 2*8
-xor edi, ebx
-shr edx, 16
-movzx ebx, dl
-shr edx, 8
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 1*8
-xor eax, ebx
-movzx ebx, BYTE PTR [r11+1+4*rdx]
-xor ecx, ebx
-mov edx, r13d
-movzx ebx, dl
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 3*8
-xor ecx, ebx
-movzx ebx, dh
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 2*8
-xor esi, ebx
-shr edx, 16
-movzx ebx, dl
-shr edx, 8
-movzx ebx, BYTE PTR [r11+1+4*rbx]
-shl ebx, 1*8
-xor edi, ebx
-movzx ebx, BYTE PTR [r11+1+4*rdx]
-xor eax, ebx
-mov rbx, [rsp + 6*8 + 7*8]	; xorBlock
-test rbx, rbx
-jz label1
-xor eax, [rbx+0*4]
-xor ecx, [rbx+1*4]
-xor esi, [rbx+2*4]
-xor edi, [rbx+3*4]
-label1:
-mov rbx, [rsp + 7*8 + 7*8]	; outBlock
-mov [rbx+0*4], eax
-mov [rbx+1*4], ecx
-mov [rbx+2*4], esi
-mov [rbx+3*4], edi
-pop r15
-pop r14
-pop r13
-pop r12
-pop rdi
-pop rsi
-pop rbx
-ret
-Rijndael_Enc_ProcessAndXorBlock ENDP
 ALIGN 8
 Sosemanuk_OperateKeystream PROC FRAME
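
Background on the changelog's first two bullets: an AuthenticatedSymmetricCipher such as the new GCM mode is meant to be driven through the new Filter wrappers. A minimal sketch, assuming the 5.6 headers aes.h, gcm.h, and filters.h; the zeroed key/IV and the main() scaffolding are illustrative only, not taken from this patch:

#include "aes.h"
#include "gcm.h"
#include "filters.h"
#include <string>

int main()
{
    using namespace CryptoPP;

    byte key[AES::DEFAULT_KEYLENGTH] = {};  // demo key; use a real RNG/KDF
    byte iv[12] = {};                       // demo IV; must be unique per key

    GCM<AES>::Encryption enc;
    enc.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));

    std::string plain = "attack at dawn", cipher;
    // The filter encrypts and appends the authentication tag
    // to the ciphertext it writes into the sink.
    StringSource(plain, true,
        new AuthenticatedEncryptionFilter(enc, new StringSink(cipher)));
    return 0;
}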
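
The Baseline_Add/Baseline_Sub routines deleted in the first hunk are carry-chain loops for multi-precision integers: the carry lives in the CPU carry flag across adc/sbb, and the negative index plus jrcxz ends the loop without an inc or cmp that would clobber that flag (hence the Core 2 comment). A portable C++ restatement of what Baseline_Add computes; the function name and the one-word-per-iteration loop are illustrative simplifications (the asm processes two words per iteration):

#include <cstddef>
#include <cstdint>

typedef std::uint64_t word;

// C = A + B over N 64-bit words; returns the final carry (0 or 1),
// matching the asm's "mov rax, 0 / adc rax, rax" epilogue.
int Baseline_Add_Sketch(std::size_t N, word *C, const word *A, const word *B)
{
    word carry = 0;
    for (std::size_t i = 0; i < N; i++)
    {
        word s1 = A[i] + B[i];  // wraps on overflow; the wrap is the carry-out
        word c1 = s1 < A[i];
        word s2 = s1 + carry;   // add the carry-in from the previous word
        word c2 = s2 < s1;
        C[i] = s2;
        carry = c1 | c2;        // at most one of c1, c2 can be set
    }
    return (int)carry;
}

Baseline_Sub has the same shape with sub/sbb and a borrow in place of the carry.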
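
The deleted Rijndael_Enc_ProcessAndXorBlock also shows a cache-timing countermeasure: before any key-dependent lookup, the label2 loop ANDs a zeroed register through the whole 1 KB T-table at a caller-supplied stride ("and ebx, [r11+rdi] / add edi, edx" up to 1024, plus a final touch of [r11+1020]) so that every cache line is resident first. A hedged C++ restatement of just that preload idea; the function and parameter names are assumptions for illustration:

#include <cstddef>
#include <cstdint>

// Touch one word per cache line of a lookup table before
// secret-dependent reads begin, so all lines are already cached.
static void PreloadTable(const std::uint32_t *table,  // r11 in the asm
                         std::size_t tableBytes,      // 1024 in the asm
                         std::size_t lineBytes)       // stride passed in edx
{
    volatile std::uint32_t sink = 0;  // volatile keeps the loads alive
    for (std::size_t i = 0; i < tableBytes; i += lineBytes)
        sink = sink & table[i / sizeof(std::uint32_t)];
}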