From df1ffe1e41f89222c379d982e543c2a32da78cbd Mon Sep 17 00:00:00 2001
From: weidai
Date: Fri, 4 May 2007 15:24:09 +0000
Subject: fix compile for x64, DLL and VC 6

git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@332 57ff6487-cd31-0410-9ec3-f628ee90f5f0
---
 rijndael.cpp | 254 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 158 insertions(+), 96 deletions(-)

(limited to 'rijndael.cpp')

diff --git a/rijndael.cpp b/rijndael.cpp
index 4a8572f..ac4f769 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
-#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
 	if (HasMMX())
 	{
 		const word32 *k = m_key;
 		const word32 *kLoopEnd = k + m_rounds*4;
+	#if CRYPTOPP_BOOL_X64
+		#define K_REG			r8
+		#define K_END_REG		r9
+		#define SAVE_K
+		#define RESTORE_K
+		#define RESTORE_K_END
+		#define SAVE_0(x)		AS2(mov r10d, x)
+		#define SAVE_1(x)		AS2(mov r11d, x)
+		#define SAVE_2(x)		AS2(mov r12d, x)
+		#define RESTORE_0(x)	AS2(mov x, r10d)
+		#define RESTORE_1(x)	AS2(mov x, r11d)
+		#define RESTORE_2(x)	AS2(mov x, r12d)
+	#else
+		#define K_REG			esi
+		#define K_END_REG		edi
+		#define SAVE_K			AS2(movd mm4, esi)
+		#define RESTORE_K		AS2(movd esi, mm4)
+		#define RESTORE_K_END	AS2(movd edi, mm5)
+		#define SAVE_0(x)		AS2(movd mm0, x)
+		#define SAVE_1(x)		AS2(movd mm1, x)
+		#define SAVE_2(x)		AS2(movd mm2, x)
+		#define RESTORE_0(x)	AS2(movd x, mm0)
+		#define RESTORE_1(x)	AS2(movd x, mm1)
+		#define RESTORE_2(x)	AS2(movd x, mm2)
+	#endif
 #ifdef __GNUC__
 		word32 t0, t1, t2, t3;
 		__asm__ __volatile__
 		(
 		".intel_syntax noprefix;"
-	AS1(	push ebx)
-	AS1(	push ebp)
-	AS2(	mov ebp, eax)
+	AS_PUSH(	bx)
+	AS_PUSH(	bp)
+	AS2(	mov WORD_REG(bp), WORD_REG(ax))
+	#if CRYPTOPP_BOOL_X64
+	// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
+	AS1(	pushq K_REG)
+	AS1(	pushq K_END_REG)
+	AS1(	pushq r10)
+	AS1(	pushq r11)
+	AS1(	pushq r12)
+	AS2(	mov K_REG, rsi)
+	AS2(	mov K_END_REG, rcx)
+	#else
 	AS2(	movd mm5, ecx)
+	#endif
 #else
+	#if _MSC_VER < 1300
+		const word32 *t = Te;
+		AS2(	mov eax, t)
+	#endif
 	AS2(	mov edx, g_cacheLineSize)
-	AS2(	mov edi, inBlock)
-	AS2(	mov esi, k)
+	AS2(	mov WORD_REG(di), inBlock)
+	AS2(	mov K_REG, k)
 	AS2(	movd mm5, kLoopEnd)
-	AS1(	push ebp)
+	#if _MSC_VER < 1300
+	AS_PUSH(	bx)
+	AS_PUSH(	bp)
+	AS2(	mov ebp, eax)
+	#else
+	AS_PUSH(	bp)
 	AS2(	lea ebp, Te)
+	#endif
 #endif
-	AS2(	mov eax, [esi+0*4])		// s0
-	AS2(	xor eax, [edi+0*4])
-	AS2(	movd mm0, eax)
-	AS2(	mov ebx, [esi+1*4])
-	AS2(	xor ebx, [edi+1*4])
-	AS2(	movd mm1, ebx)
+	AS2(	mov eax, [K_REG+0*4])		// s0
+	AS2(	xor eax, [WORD_REG(di)+0*4])
+	SAVE_0(eax)
+	AS2(	mov ebx, [K_REG+1*4])
+	AS2(	xor ebx, [WORD_REG(di)+1*4])
+	SAVE_1(ebx)
 	AS2(	and ebx, eax)
-	AS2(	mov eax, [esi+2*4])
-	AS2(	xor eax, [edi+2*4])
-	AS2(	movd mm2, eax)
+	AS2(	mov eax, [K_REG+2*4])
+	AS2(	xor eax, [WORD_REG(di)+2*4])
+	SAVE_2(eax)
 	AS2(	and ebx, eax)
-	AS2(	mov ecx, [esi+3*4])
-	AS2(	xor ecx, [edi+3*4])
+	AS2(	mov ecx, [K_REG+3*4])
+	AS2(	xor ecx, [WORD_REG(di)+3*4])
 	AS2(	and ebx, ecx)
 	// read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction
 	AS2(	and ebx, 0)
 	AS2(	mov edi, ebx)	// make index depend on previous loads to simulate lfence
 	ASL(2)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
 	AS2(	cmp edi, 1024)
 	ASJ(	jl, 2, b)
-	AS2(	and ebx, [ebp+1020])
+	AS2(	and ebx, [WORD_REG(bp)+1020])
+#if CRYPTOPP_BOOL_X64
+	AS2(	xor r10d, ebx)
+	AS2(	xor r11d, ebx)
+	AS2(	xor r12d, ebx)
+#else
 	AS2(	movd mm6, ebx)
 	AS2(	pxor mm2, mm6)
 	AS2(	pxor mm1, mm6)
 	AS2(	pxor mm0, mm6)
+#endif
 	AS2(	xor ecx, ebx)
-	AS2(	mov edi, [esi+4*4])		// t0
-	AS2(	mov eax, [esi+5*4])
-	AS2(	mov ebx, [esi+6*4])
-	AS2(	mov edx, [esi+7*4])
-	AS2(	add esi, 8*4)
-	AS2(	movd mm4, esi)
+	AS2(	mov edi, [K_REG+4*4])		// t0
+	AS2(	mov eax, [K_REG+5*4])
+	AS2(	mov ebx, [K_REG+6*4])
+	AS2(	mov edx, [K_REG+7*4])
+	AS2(	add K_REG, 8*4)
+	SAVE_K

 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(a, [ebp+3*1024+4*esi])
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])

 #define s0		xor edi
 #define s1		xor eax
@@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #define t3		xor edx

 	QUARTER_ROUND(c, t0, t1, t2, t3)
-	AS2(	movd ecx, mm2)
+	RESTORE_2(ecx)
 	QUARTER_ROUND(c, t3, t0, t1, t2)
-	AS2(	movd ecx, mm1)
+	RESTORE_1(ecx)
 	QUARTER_ROUND(c, t2, t3, t0, t1)
-	AS2(	movd ecx, mm0)
+	RESTORE_0(ecx)
 	QUARTER_ROUND(c, t1, t2, t3, t0)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)
 #undef QUARTER_ROUND

-	AS2(	movd esi, mm4)
+	RESTORE_K
 	ASL(0)
-	AS2(	mov edi, [esi+0*4])
-	AS2(	mov eax, [esi+1*4])
-	AS2(	mov ebx, [esi+2*4])
-	AS2(	mov ecx, [esi+3*4])
+	AS2(	mov edi, [K_REG+0*4])
+	AS2(	mov eax, [K_REG+1*4])
+	AS2(	mov ebx, [K_REG+2*4])
+	AS2(	mov ecx, [K_REG+3*4])

 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(d, [ebp+0*1024+4*esi])
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])

 	QUARTER_ROUND(d, s0, s1, s2, s3)
-	AS2(	movd edx, mm2)
+	RESTORE_2(edx)
 	QUARTER_ROUND(d, s3, s0, s1, s2)
-	AS2(	movd edx, mm1)
+	RESTORE_1(edx)
 	QUARTER_ROUND(d, s2, s3, s0, s1)
-	AS2(	movd edx, mm0)
+	RESTORE_0(edx)
 	QUARTER_ROUND(d, s1, s2, s3, s0)
-	AS2(	movd esi, mm4)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
+	RESTORE_K
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)

-	AS2(	mov edi, [esi+4*4])
-	AS2(	mov eax, [esi+5*4])
-	AS2(	mov ebx, [esi+6*4])
-	AS2(	mov edx, [esi+7*4])
+	AS2(	mov edi, [K_REG+4*4])
+	AS2(	mov eax, [K_REG+5*4])
+	AS2(	mov ebx, [K_REG+6*4])
+	AS2(	mov edx, [K_REG+7*4])

 	QUARTER_ROUND(c, t0, t1, t2, t3)
-	AS2(	movd ecx, mm2)
+	RESTORE_2(ecx)
 	QUARTER_ROUND(c, t3, t0, t1, t2)
-	AS2(	movd ecx, mm1)
+	RESTORE_1(ecx)
 	QUARTER_ROUND(c, t2, t3, t0, t1)
-	AS2(	movd ecx, mm0)
+	RESTORE_0(ecx)
 	QUARTER_ROUND(c, t1, t2, t3, t0)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
-
-	AS2(	movd esi, mm4)
-	AS2(	movd edi, mm5)
-	AS2(	add esi, 8*4)
-	AS2(	movd mm4, esi)
-	AS2(	cmp edi, esi)
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)
+
+	RESTORE_K
+	RESTORE_K_END
+	AS2(	add K_REG, 8*4)
+	SAVE_K
+	AS2(	cmp K_END_REG, K_REG)
 	ASJ(	jne, 0, b)
 #undef QUARTER_ROUND
@@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #undef t2
 #undef t3

-	AS2(	mov eax, [edi+0*4])
-	AS2(	mov ecx, [edi+1*4])
-	AS2(	mov esi, [edi+2*4])
-	AS2(	mov edi, [edi+3*4])
+	AS2(	mov eax, [K_END_REG+0*4])
+	AS2(	mov ecx, [K_END_REG+1*4])
+	AS2(	mov esi, [K_END_REG+2*4])
+	AS2(	mov edi, [K_END_REG+3*4])

 #define QUARTER_ROUND(a, b, c, d)	\
 	AS2(	movzx ebx, dl)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 3*8)\
 	AS2(	xor a, ebx)\
 	AS2(	movzx ebx, dh)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 2*8)\
 	AS2(	xor b, ebx)\
 	AS2(	shr edx, 16)\
 	AS2(	movzx ebx, dl)\
 	AS2(	shr edx, 8)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 1*8)\
 	AS2(	xor c, ebx)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
 	AS2(	xor d, ebx)

 	QUARTER_ROUND(eax, ecx, esi, edi)
-	AS2(	movd edx, mm2)
+	RESTORE_2(edx)
 	QUARTER_ROUND(edi, eax, ecx, esi)
-	AS2(	movd edx, mm1)
+	RESTORE_1(edx)
 	QUARTER_ROUND(esi, edi, eax, ecx)
-	AS2(	movd edx, mm0)
+	RESTORE_0(edx)
 	QUARTER_ROUND(ecx, esi, edi, eax)
 #undef QUARTER_ROUND

-	AS1(	pop ebp)
-	AS1(	emms)
+#if CRYPTOPP_BOOL_X64
+	AS1(popq	r12)
+	AS1(popq	r11)
+	AS1(popq	r10)
+	AS1(popq	K_END_REG)
+	AS1(popq	K_REG)
+#else
+	AS1(emms)
+#endif
+	AS_POP(		bp)
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
+	AS_POP(		bx)
+#endif
 #ifdef __GNUC__
-	AS1(	pop ebx)
 		".att_syntax prefix;"
 		: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
 		: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
@@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	((word32 *)outBlock)[2] = t2;
 	((word32 *)outBlock)[3] = t3;
 #else
-	AS2(	mov ebx, xorBlock)
-	AS2(	test ebx, ebx)
+	AS2(	mov WORD_REG(bx), xorBlock)
+	AS2(	test WORD_REG(bx), WORD_REG(bx))
 	ASJ(	jz, 1, f)
-	AS2(	xor eax, [ebx+0*4])
-	AS2(	xor ecx, [ebx+1*4])
-	AS2(	xor esi, [ebx+2*4])
-	AS2(	xor edi, [ebx+3*4])
+	AS2(	xor eax, [WORD_REG(bx)+0*4])
+	AS2(	xor ecx, [WORD_REG(bx)+1*4])
+	AS2(	xor esi, [WORD_REG(bx)+2*4])
+	AS2(	xor edi, [WORD_REG(bx)+3*4])
 	ASL(1)
-	AS2(	mov ebx, outBlock)
-	AS2(	mov [ebx+0*4], eax)
-	AS2(	mov [ebx+1*4], ecx)
-	AS2(	mov [ebx+2*4], esi)
-	AS2(	mov [ebx+3*4], edi)
+	AS2(	mov WORD_REG(bx), outBlock)
+	AS2(	mov [WORD_REG(bx)+0*4], eax)
+	AS2(	mov [WORD_REG(bx)+1*4], ecx)
+	AS2(	mov [WORD_REG(bx)+2*4], esi)
+	AS2(	mov [WORD_REG(bx)+3*4], edi)
 #endif
 	}
 	else
--
cgit v1.2.1
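
Note on the macro layer used above: WORD_REG, K_REG/K_END_REG, AS_PUSH/AS_POP and the AS1/AS2 wrappers are defined elsewhere in the library's headers, not in this patch's rijndael.cpp hunks (only the SAVE_*/RESTORE_* and K_REG selections are added here). The snippet below is a hypothetical, self-contained sketch of the idea only, not the library's actual definitions: one copy of Intel-syntax operand text that expands to 32-bit registers with MMX spill slots on x86, and to 64-bit registers plus the spare r10-r12 registers on x64, which is why the x64 path no longer needs the trailing emms.

    // Illustrative sketch only -- NOT the real definitions from Crypto++'s headers.
    // Prints what a few of the shared assembly fragments expand to for the target
    // this is compiled for; it performs no actual encryption or inline assembly.
    #include <cstdio>

    #if defined(_M_X64) || defined(__x86_64__)
    #  define WORD_REG(x) r##x           // WORD_REG(bp) -> rbp
    #  define K_REG       r8             // round-key pointer kept in a spare register
    #  define SAVE_0(x)   mov r10d, x    // spare 32-bit registers replace mm0-mm2
    #else
    #  define WORD_REG(x) e##x           // WORD_REG(bp) -> ebp
    #  define K_REG       esi            // round-key pointer kept in esi
    #  define SAVE_0(x)   movd mm0, x    // state word parked in an MMX register
    #endif

    // Stringize after full macro expansion (variadic so embedded commas survive).
    #define STR_(...) #__VA_ARGS__
    #define STR(...)  STR_(__VA_ARGS__)

    int main()
    {
        std::printf("mov %s, inBlock\n", STR(WORD_REG(di)));
        std::printf("mov eax, [%s+0*4]\n", STR(K_REG));
        std::printf("%s\n", STR(SAVE_0(eax)));
        return 0;
    }

On a 64-bit build this prints the r8/r10d forms, on a 32-bit build the esi/mm0 forms; the patch relies on exactly this kind of token pasting so one body of AES round code serves both targets, with the AS1/AS2 wrappers emitting either MSVC __asm statements or GCC inline-asm strings.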
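
The "read Te0 into L1 cache" block that the first hunk rewrites with WORD_REG is a timing-attack countermeasure: before any secret-dependent table lookups, it touches one word in each cache line of the first 1 KB of Te, chaining the loads through a register so they cannot be reordered ahead of the input loads, because lfence is SSE2 and this path only assumes MMX. A rough C++ rendering of the idea (hypothetical names; the real code does this in assembly using Te and g_cacheLineSize) looks like this:

    // Hypothetical sketch of the cache-priming idea, not code from the patch.
    #include <cstddef>

    typedef unsigned int word32;

    inline word32 PrimeTable(const word32 *table, std::size_t cacheLineSize)
    {
        word32 dep = ~word32(0);
        // Touch one word per cache line of the first 1024 bytes so that later
        // secret-indexed loads are L1 hits and leak less through timing.
        for (std::size_t offset = 0; offset < 1024; offset += cacheLineSize)
            dep &= table[offset / sizeof(word32)];
        dep &= table[1020 / sizeof(word32)];   // last word, as in the assembly
        // The caller folds this into the state; in the real code the accumulator
        // is ANDed with 0 first, so the value is always zero and exists only to
        // make the state computation data-dependent on the priming loads.
        return dep & 0;
    }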