From caf9e032e6b4ccb114a74a3936c916bcfaba262d Mon Sep 17 00:00:00 2001 From: weidai Date: Mon, 2 Mar 2009 02:39:17 +0000 Subject: changes for 5.6: - added AuthenticatedSymmetricCipher interface class and Filter wrappers - added CCM, GCM (with SSE2 assembly), CMAC, and SEED - improved AES speed on x86 and x64 - removed WORD64_AVAILABLE; compiler 64-bit int support is now required git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@433 57ff6487-cd31-0410-9ec3-f628ee90f5f0 --- rijndael.cpp | 967 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 619 insertions(+), 348 deletions(-) (limited to 'rijndael.cpp') diff --git a/rijndael.cpp b/rijndael.cpp index b89e3b3..05c403a 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -4,6 +4,16 @@ // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code +/* +The assembly code was rewritten in Feb 2009 by Wei Dai to do counter mode +caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein +and Peter Schwabe in their paper "New AES software speed records". The round +function was also modified to include a trick similar to one in Brian Gladman's +x86 assembly code, doing an 8-bit register move to minimize the number of +register spills. Also switched to compressed tables and copying round keys to +the stack. +*/ + /* Defense against timing attacks was added in July 2006 by Wei Dai. @@ -58,6 +68,72 @@ being unloaded from L1 cache, until that round is finished. NAMESPACE_BEGIN(CryptoPP) +#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) +namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];} +using namespace rdtable; +#else +static word64 Te[256]; +#endif +static word32 Td[256*4]; +#else +static word32 Te[256*4], Td[256*4]; +#endif +static bool s_TeFilled = false, s_TdFilled = false; + +#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) + +#define f3(x) (f2(x) ^ x) +#define f9(x) (f8(x) ^ x) +#define fb(x) (f8(x) ^ f2(x) ^ x) +#define fd(x) (f8(x) ^ f4(x) ^ x) +#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +void Rijndael::Base::FillEncTable() +{ + for (int i=0; i<256; i++) + { + byte x = Se[i]; +#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS + word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; + Te[i] = word64(y | f3(x))<<32 | y; +#else + word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; + for (int j=0; j<4; j++) + { + Te[i+j*256] = y; + y = rotrFixed(y, 8); + } +#endif + } +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + Te[256] = Te[257] = 0; +#endif + s_TeFilled = true; +} + +void Rijndael::Base::FillDecTable() +{ + for (int i=0; i<256; i++) + { + byte x = Sd[i]; +#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS_ + word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24; + Td[i] = word64(y | fb(x))<<32 | y | x; +#else + word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;; + for (int j=0; j<4; j++) + { + Td[i+j*256] = y; + y = rotrFixed(y, 8); + } +#endif + } + s_TdFilled = true; +} + void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &) { AssertValidKeyLength(keylen); @@ -106,8 +182,16 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c rk += keylen/4; } - if (!IsForwardTransformation()) + if (IsForwardTransformation()) { + if (!s_TeFilled) + 
FillEncTable(); + } + else + { + if (!s_TdFilled) + FillDecTable(); + unsigned int i, j; rk = m_key; @@ -148,349 +232,530 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } -#ifdef CRYPTOPP_X64_MASM_AVAILABLE -extern "C" { -void Rijndael_Enc_ProcessAndXorBlock(const word32 *table, word32 cacheLineSize, const word32 *k, const word32 *kLoopEnd, const byte *inBlock, const byte *xorBlock, byte *outBlock); -} -#endif - #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code -void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const +#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM + +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k) { -#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM +#if CRYPTOPP_BOOL_X86 + +#define L_REG esp +#define L_INDEX(i) (L_REG+512+i) +#define L_INXORBLOCKS L_INBLOCKS+4 +#define L_OUTXORBLOCKS L_INBLOCKS+8 +#define L_OUTBLOCKS L_INBLOCKS+12 +#define L_INCREMENTS L_INDEX(16*15) +#define L_SP L_INDEX(16*16) +#define L_LENGTH L_INDEX(16*16+4) +#define L_KEYS_BEGIN L_INDEX(16*16+8) + +#define MOVD movd +#define MM(i) mm##i + +#define MXOR(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( movd mm7, DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ + AS2( pxor MM(a), mm7)\ + +#define MMOV(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( movd MM(a), DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ + +#else + +#define L_REG r8 +#define L_INDEX(i) (r8+i) +#define L_INXORBLOCKS L_INBLOCKS+8 +#define L_OUTXORBLOCKS L_INBLOCKS+16 +#define L_OUTBLOCKS L_INBLOCKS+24 +#define L_INCREMENTS L_INDEX(16*16) +#define L_BP L_INDEX(16*18) +#define L_LENGTH L_INDEX(16*18+8) +#define L_KEYS_BEGIN L_INDEX(16*19) + +#define MOVD mov +#define MM(i) r1##i##d + +#define MXOR(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( xor MM(a), DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ + +#define MMOV(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( mov MM(a), DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ -#ifdef CRYPTOPP_X64_MASM_AVAILABLE - Rijndael_Enc_ProcessAndXorBlock(Te, g_cacheLineSize, m_key, m_key + m_rounds*4, inBlock, xorBlock, outBlock); - return; #endif -#if defined(CRYPTOPP_X86_ASM_AVAILABLE) - #ifdef CRYPTOPP_GENERATE_X64_MASM +#define L_SUBKEYS L_INDEX(0) +#define L_SAVED_X L_SUBKEYS +#define L_KEY12 L_INDEX(16*12) +#define L_LASTROUND L_INDEX(16*13) +#define L_INBLOCKS L_INDEX(16*14) +#define MAP0TO4(i) (ASM_MOD(i+3,4)+1) + +#define XOR(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( xor a, DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ + +#define MOV(a,b,c) \ + AS2( movzx ebp, b)\ + AS2( mov a, DWORD PTR [WORD_REG(si)+8*WORD_REG(bp)+MAP0TO4(c)])\ + +#ifdef CRYPTOPP_GENERATE_X64_MASM ALIGN 8 - Rijndael_Enc_ProcessAndXorBlock PROC FRAME - rex_push_reg rbx - push_reg rsi + Rijndael_Enc_AdvancedProcessBlocks PROC FRAME + rex_push_reg rsi push_reg rdi + push_reg rbx + push_reg rbp push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 .endprolog - mov AS_REG_7, rcx - mov rdi, [rsp + 5*8 + 7*8] ; inBlock - #else - if (HasMMX()) - { - const word32 *k = m_key; - const word32 *kLoopEnd = k + m_rounds*4; - #endif - - #if CRYPTOPP_BOOL_X64 - #define K_REG r8 - #define K_END_REG r9 - #define SAVE_K - #define RESTORE_K - #define RESTORE_K_END - #define SAVE_0(x) AS2(mov r13d, x) - #define SAVE_1(x) AS2(mov r14d, x) - #define SAVE_2(x) 
AS2(mov r15d, x) - #define RESTORE_0(x) AS2(mov x, r13d) - #define RESTORE_1(x) AS2(mov x, r14d) - #define RESTORE_2(x) AS2(mov x, r15d) - #else - #define K_REG esi - #define K_END_REG edi - #define SAVE_K AS2(movd mm4, esi) - #define RESTORE_K AS2(movd esi, mm4) - #define RESTORE_K_END AS2(movd edi, mm5) - #define SAVE_0(x) AS2(movd mm0, x) - #define SAVE_1(x) AS2(movd mm1, x) - #define SAVE_2(x) AS2(movd mm2, x) - #define RESTORE_0(x) AS2(movd x, mm0) - #define RESTORE_1(x) AS2(movd x, mm1) - #define RESTORE_2(x) AS2(movd x, mm2) - #endif -#ifdef __GNUC__ - word32 t0, t1, t2, t3; - __asm__ __volatile__ - ( - ".intel_syntax noprefix;" + mov r8, rcx + mov rsi, ?Te@rdtable@CryptoPP@@3PA_KA + mov rdi, QWORD PTR [?g_cacheLineSize@CryptoPP@@3IA] +#elif defined(__GNUC__) + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + ASL(Rijndael_Enc_AdvancedProcessBlocks) #if CRYPTOPP_BOOL_X64 - AS2( mov K_REG, rsi) - AS2( mov K_END_REG, rcx) - #else - AS1( push ebx) - AS1( push ebp) - AS2( movd mm5, ecx) + AS2( mov r8, rcx) + AS2( mov [L_BP], rbp) #endif - AS2( mov AS_REG_7, WORD_REG(ax)) -#elif CRYPTOPP_BOOL_X86 - #if _MSC_VER < 1300 - const word32 *t = Te; - AS2( mov eax, t) - #endif - AS2( mov edx, g_cacheLineSize) - AS2( mov WORD_REG(di), inBlock) - AS2( mov K_REG, k) - AS2( movd mm5, kLoopEnd) - #if _MSC_VER < 1300 - AS1( push ebx) - AS1( push ebp) - AS2( mov AS_REG_7, eax) - #else - AS1( push ebp) - AS2( lea AS_REG_7, Te) - #endif -#endif - AS2( mov eax, [K_REG+0*4]) // s0 - AS2( xor eax, [WORD_REG(di)+0*4]) - SAVE_0(eax) - AS2( mov ebx, [K_REG+1*4]) - AS2( xor ebx, [WORD_REG(di)+1*4]) - SAVE_1(ebx) - AS2( and ebx, eax) - AS2( mov eax, [K_REG+2*4]) - AS2( xor eax, [WORD_REG(di)+2*4]) - SAVE_2(eax) - AS2( and ebx, eax) - AS2( mov ecx, [K_REG+3*4]) - AS2( xor ecx, [WORD_REG(di)+3*4]) - AS2( and ebx, ecx) - - // read Te0 into L1 cache. 
this code could be simplifed by using lfence, but that is an SSE2 instruction - AS2( and ebx, 0) - AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence - ASL(2) - AS2( and ebx, [AS_REG_7+WORD_REG(di)]) - AS2( add edi, edx) - AS2( and ebx, [AS_REG_7+WORD_REG(di)]) - AS2( add edi, edx) - AS2( and ebx, [AS_REG_7+WORD_REG(di)]) - AS2( add edi, edx) - AS2( and ebx, [AS_REG_7+WORD_REG(di)]) - AS2( add edi, edx) - AS2( cmp edi, 1024) - ASJ( jl, 2, b) - AS2( and ebx, [AS_REG_7+1020]) -#if CRYPTOPP_BOOL_X64 - AS2( xor r13d, ebx) - AS2( xor r14d, ebx) - AS2( xor r15d, ebx) #else - AS2( movd mm6, ebx) - AS2( pxor mm2, mm6) - AS2( pxor mm1, mm6) - AS2( pxor mm0, mm6) + AS1( push esi) + AS1( push edi) + AS2( lea esi, [Te]) + AS2( mov edi, [g_cacheLineSize]) #endif - AS2( xor ecx, ebx) - AS2( mov edi, [K_REG+4*4]) // t0 - AS2( mov eax, [K_REG+5*4]) - AS2( mov ebx, [K_REG+6*4]) - AS2( mov edx, [K_REG+7*4]) - AS2( add K_REG, 8*4) - SAVE_K - -#define QUARTER_ROUND(t, a, b, c, d) \ - AS2(movzx esi, t##l)\ - AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])\ - AS2(movzx esi, t##h)\ - AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ - AS2(shr e##t##x, 16)\ - AS2(movzx esi, t##l)\ - AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ - AS2(movzx esi, t##h)\ - AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)]) - -#define s0 xor edi -#define s1 xor eax -#define s2 xor ebx -#define s3 xor ecx -#define t0 xor edi -#define t1 xor eax -#define t2 xor ebx -#define t3 xor edx - - QUARTER_ROUND(c, t0, t1, t2, t3) - RESTORE_2(ecx) - QUARTER_ROUND(c, t3, t0, t1, t2) - RESTORE_1(ecx) - QUARTER_ROUND(c, t2, t3, t0, t1) - RESTORE_0(ecx) - QUARTER_ROUND(c, t1, t2, t3, t0) - SAVE_2(ebx) - SAVE_1(eax) - SAVE_0(edi) -#undef QUARTER_ROUND +#if CRYPTOPP_BOOL_X86 + AS_PUSH_IF86( bx) + AS_PUSH_IF86( bp) + AS2( mov [ecx+16*12+16*4], esp) + AS2( lea esp, [ecx-512]) +#endif - RESTORE_K + // copy subkeys to stack + AS2( mov WORD_REG(bp), [L_KEYS_BEGIN]) + AS2( mov WORD_REG(ax), 16) + AS2( and WORD_REG(ax), WORD_REG(bp)) + AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter) + AS2( movdqa [L_KEY12], xmm3) + AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16]) + AS2( sub WORD_REG(ax), WORD_REG(bp)) + ASL(0) + AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(bp)]) + AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(bp)], xmm0) + AS2( add WORD_REG(bp), 16) + AS2( cmp WORD_REG(bp), 16*12) + ASJ( jl, 0, b) + + // read subkeys 0, 1 and last + AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(bp)]) // last subkey + AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0 + AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3 + AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7 + AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11 + AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15 + + // load table into cache + AS2( xor WORD_REG(ax), WORD_REG(ax)) + ASL(9) + AS2( mov ebp, [WORD_REG(si)+WORD_REG(ax)]) + AS2( add WORD_REG(ax), WORD_REG(di)) + AS2( mov ebp, [WORD_REG(si)+WORD_REG(ax)]) + AS2( add WORD_REG(ax), WORD_REG(di)) + AS2( mov ebp, [WORD_REG(si)+WORD_REG(ax)]) + AS2( add WORD_REG(ax), WORD_REG(di)) + AS2( mov ebp, [WORD_REG(si)+WORD_REG(ax)]) + AS2( add WORD_REG(ax), WORD_REG(di)) + AS2( cmp WORD_REG(ax), 2048) + ASJ( jl, 9, b) + AS1( lfence) + + AS2( test DWORD PTR [L_LENGTH], 1) + ASJ( jz, 8, f) + + // counter mode one-time setup + AS2( mov WORD_REG(bp), [L_INBLOCKS]) + AS2( movdqa xmm2, [WORD_REG(bp)]) // counter + AS2( pxor xmm2, xmm1) + AS2( psrldq xmm1, 14) + AS2( movd eax, xmm1) + AS2( mov al, BYTE PTR [WORD_REG(bp)+15]) + AS2( MOVD MM(2), 
eax) +#if CRYPTOPP_BOOL_X86 + AS2( mov eax, 1) + AS2( movd mm3, eax) +#endif - ASL(0) - AS2( mov edi, [K_REG+0*4]) - AS2( mov eax, [K_REG+1*4]) - AS2( mov ebx, [K_REG+2*4]) - AS2( mov ecx, [K_REG+3*4]) + // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx + AS2( movd eax, xmm2) + AS2( psrldq xmm2, 4) + AS2( movd edi, xmm2) + AS2( psrldq xmm2, 4) + MXOR( 1, al, 0) // 0 + XOR( edx, ah, 1) // 1 + AS2( shr eax, 16) + XOR( ecx, al, 2) // 2 + XOR( ebx, ah, 3) // 3 + AS2( mov eax, edi) + AS2( movd edi, xmm2) + AS2( psrldq xmm2, 4) + XOR( ebx, al, 0) // 4 + MXOR( 1, ah, 1) // 5 + AS2( shr eax, 16) + XOR( edx, al, 2) // 6 + XOR( ecx, ah, 3) // 7 + AS2( mov eax, edi) + AS2( movd edi, xmm2) + XOR( ecx, al, 0) // 8 + XOR( ebx, ah, 1) // 9 + AS2( shr eax, 16) + MXOR( 1, al, 2) // 10 + XOR( edx, ah, 3) // 11 + AS2( mov eax, edi) + XOR( edx, al, 0) // 12 + XOR( ecx, ah, 1) // 13 + AS2( shr eax, 16) + XOR( ebx, al, 2) // 14 + AS2( psrldq xmm2, 3) + + // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0 + AS2( mov eax, [L_KEY12+0*4]) + AS2( mov edi, [L_KEY12+2*4]) + AS2( MOVD MM(0), [L_KEY12+3*4]) + MXOR( 0, cl, 3) /* 11 */ + XOR( edi, bl, 3) /* 7 */ + MXOR( 0, bh, 2) /* 6 */ + AS2( shr ebx, 16) /* 4,5 */ + XOR( eax, bl, 1) /* 5 */ + MOV( ebx, bh, 0) /* 4 */ + AS2( xor ebx, [L_KEY12+1*4]) + XOR( eax, ch, 2) /* 10 */ + AS2( shr ecx, 16) /* 8,9 */ + XOR( eax, dl, 3) /* 15 */ + XOR( ebx, dh, 2) /* 14 */ + AS2( shr edx, 16) /* 12,13 */ + XOR( edi, ch, 0) /* 8 */ + XOR( ebx, cl, 1) /* 9 */ + XOR( edi, dl, 1) /* 13 */ + MXOR( 0, dh, 0) /* 12 */ + + AS2( movd ecx, xmm2) + AS2( MOVD edx, MM(1)) + AS2( MOVD [L_SAVED_X+3*4], MM(0)) + AS2( mov [L_SAVED_X+0*4], eax) + AS2( mov [L_SAVED_X+1*4], ebx) + AS2( mov [L_SAVED_X+2*4], edi) + ASJ( jmp, 5, f) + + ASL(3) + // non-counter mode per-block setup + AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3 + AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7 + AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11 + AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15 + ASL(8) + AS2( mov WORD_REG(ax), [L_INBLOCKS]) + AS2( movdqu xmm2, [WORD_REG(ax)]) + AS2( mov WORD_REG(bp), [L_INXORBLOCKS]) + AS2( movdqu xmm5, [WORD_REG(bp)]) + AS2( pxor xmm2, xmm1) + AS2( pxor xmm2, xmm5) + + // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx + AS2( movd eax, xmm2) + AS2( psrldq xmm2, 4) + AS2( movd edi, xmm2) + AS2( psrldq xmm2, 4) + MXOR( 1, al, 0) // 0 + XOR( edx, ah, 1) // 1 + AS2( shr eax, 16) + XOR( ecx, al, 2) // 2 + XOR( ebx, ah, 3) // 3 + AS2( mov eax, edi) + AS2( movd edi, xmm2) + AS2( psrldq xmm2, 4) + XOR( ebx, al, 0) // 4 + MXOR( 1, ah, 1) // 5 + AS2( shr eax, 16) + XOR( edx, al, 2) // 6 + XOR( ecx, ah, 3) // 7 + AS2( mov eax, edi) + AS2( movd edi, xmm2) + XOR( ecx, al, 0) // 8 + XOR( ebx, ah, 1) // 9 + AS2( shr eax, 16) + MXOR( 1, al, 2) // 10 + XOR( edx, ah, 3) // 11 + AS2( mov eax, edi) + XOR( edx, al, 0) // 12 + XOR( ecx, ah, 1) // 13 + AS2( shr eax, 16) + XOR( ebx, al, 2) // 14 + MXOR( 1, ah, 3) // 15 + AS2( MOVD eax, MM(1)) + + AS2( add L_REG, [L_KEYS_BEGIN]) + AS2( add L_REG, 4*16) + ASJ( jmp, 2, f) + + ASL(1) + // counter-mode per-block setup + AS2( MOVD ecx, MM(2)) + AS2( MOVD edx, MM(1)) + AS2( mov eax, [L_SAVED_X+0*4]) + AS2( mov ebx, [L_SAVED_X+1*4]) + AS2( xor cl, ch) + AS2( and WORD_REG(cx), 255) + ASL(5) +#if CRYPTOPP_BOOL_X86 + AS2( paddb MM(2), mm3) +#else + AS2( add MM(2), 1) +#endif + // remaining part of second round, in: edx(previous round),ebp(keyed counter byte) 
eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx + AS2( xor edx, DWORD PTR [WORD_REG(si)+WORD_REG(cx)*8+3]) + XOR( ebx, dl, 3) + MOV( ecx, dh, 2) + AS2( shr edx, 16) + AS2( xor ecx, [L_SAVED_X+2*4]) + XOR( eax, dh, 0) + MOV( edx, dl, 1) + AS2( xor edx, [L_SAVED_X+3*4]) + + AS2( add L_REG, [L_KEYS_BEGIN]) + AS2( add L_REG, 3*16) + ASJ( jmp, 4, f) + +// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15) +// out: eax, ebx, edi, mm0 +#define ROUND() \ + MXOR( 0, cl, 3) /* 11 */\ + AS2( mov cl, al) /* 8,9,10,3 */\ + XOR( edi, ah, 2) /* 2 */\ + AS2( shr eax, 16) /* 0,1 */\ + XOR( edi, bl, 3) /* 7 */\ + MXOR( 0, bh, 2) /* 6 */\ + AS2( shr ebx, 16) /* 4,5 */\ + MXOR( 0, al, 1) /* 1 */\ + MOV( eax, ah, 0) /* 0 */\ + XOR( eax, bl, 1) /* 5 */\ + MOV( ebx, bh, 0) /* 4 */\ + XOR( eax, ch, 2) /* 10 */\ + XOR( ebx, cl, 3) /* 3 */\ + AS2( shr ecx, 16) /* 8,9 */\ + XOR( eax, dl, 3) /* 15 */\ + XOR( ebx, dh, 2) /* 14 */\ + AS2( shr edx, 16) /* 12,13 */\ + XOR( edi, ch, 0) /* 8 */\ + XOR( ebx, cl, 1) /* 9 */\ + XOR( edi, dl, 1) /* 13 */\ + MXOR( 0, dh, 0) /* 12 */\ + + ASL(2) // 2-round loop + AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4]) + AS2( mov edi, [L_SUBKEYS-4*16+2*4]) + ROUND() + AS2( mov ecx, edi) + AS2( xor eax, [L_SUBKEYS-4*16+0*4]) + AS2( xor ebx, [L_SUBKEYS-4*16+1*4]) + AS2( MOVD edx, MM(0)) + + ASL(4) + AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4]) + AS2( mov edi, [L_SUBKEYS-4*16+6*4]) + ROUND() + AS2( mov ecx, edi) + AS2( xor eax, [L_SUBKEYS-4*16+4*4]) + AS2( xor ebx, [L_SUBKEYS-4*16+5*4]) + AS2( MOVD edx, MM(0)) + + AS2( add L_REG, 32) + AS2( test L_REG, 255) + ASJ( jnz, 2, b) + AS2( sub L_REG, 16*16) + +#define LAST(a, b, c) \ + AS2( movzx ebp, a )\ + AS2( movzx edi, BYTE PTR [WORD_REG(si)+WORD_REG(bp)*8+1] )\ + AS2( movzx ebp, b )\ + AS2( xor edi, DWORD PTR [WORD_REG(si)+WORD_REG(bp)*8+0] )\ + AS2( mov WORD PTR [L_LASTROUND+c], di )\ + + // last round + LAST(ch, dl, 2) + LAST(dh, al, 6) + AS2( shr edx, 16) + LAST(ah, bl, 10) + AS2( shr eax, 16) + LAST(bh, cl, 14) + AS2( shr ebx, 16) + LAST(dh, al, 12) + AS2( shr ecx, 16) + LAST(ah, bl, 0) + LAST(bh, cl, 4) + LAST(ch, dl, 8) + + AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS]) + AS2( mov WORD_REG(bx), [L_OUTBLOCKS]) + + AS2( mov WORD_REG(cx), [L_LENGTH]) + AS2( sub WORD_REG(cx), 16) + + AS2( movdqu xmm2, [WORD_REG(ax)]) + AS2( pxor xmm2, xmm4) -#define QUARTER_ROUND(t, a, b, c, d) \ - AS2(movzx esi, t##l)\ - AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])\ - AS2(movzx esi, t##h)\ - AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\ - AS2(shr e##t##x, 16)\ - AS2(movzx esi, t##l)\ - AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\ - AS2(movzx esi, t##h)\ - AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)]) - - QUARTER_ROUND(d, s0, s1, s2, s3) - RESTORE_2(edx) - QUARTER_ROUND(d, s3, s0, s1, s2) - RESTORE_1(edx) - QUARTER_ROUND(d, s2, s3, s0, s1) - RESTORE_0(edx) - QUARTER_ROUND(d, s1, s2, s3, s0) - RESTORE_K - SAVE_2(ebx) - SAVE_1(eax) - SAVE_0(edi) - - AS2( mov edi, [K_REG+4*4]) - AS2( mov eax, [K_REG+5*4]) - AS2( mov ebx, [K_REG+6*4]) - AS2( mov edx, [K_REG+7*4]) - - QUARTER_ROUND(c, t0, t1, t2, t3) - RESTORE_2(ecx) - QUARTER_ROUND(c, t3, t0, t1, t2) - RESTORE_1(ecx) - QUARTER_ROUND(c, t2, t3, t0, t1) - RESTORE_0(ecx) - QUARTER_ROUND(c, t1, t2, t3, t0) - SAVE_2(ebx) - SAVE_1(eax) - SAVE_0(edi) - - RESTORE_K - RESTORE_K_END - AS2( add K_REG, 8*4) - SAVE_K - AS2( cmp K_END_REG, K_REG) - ASJ( jne, 0, b) +#if CRYPTOPP_BOOL_X86 + AS2( movdqa xmm0, [L_INCREMENTS]) + AS2( paddd xmm0, [L_INBLOCKS]) + AS2( movdqa [L_INBLOCKS], xmm0) +#else + AS2( movdqa xmm0, [L_INCREMENTS+16]) 
+ AS2( paddq xmm0, [L_INBLOCKS+16]) + AS2( movdqa [L_INBLOCKS+16], xmm0) +#endif -#undef QUARTER_ROUND -#undef s0 -#undef s1 -#undef s2 -#undef s3 -#undef t0 -#undef t1 -#undef t2 -#undef t3 - - AS2( mov eax, [K_END_REG+0*4]) - AS2( mov ecx, [K_END_REG+1*4]) - AS2( mov esi, [K_END_REG+2*4]) - AS2( mov edi, [K_END_REG+3*4]) - -#define QUARTER_ROUND(a, b, c, d) \ - AS2( movzx ebx, dl)\ - AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ - AS2( shl ebx, 3*8)\ - AS2( xor a, ebx)\ - AS2( movzx ebx, dh)\ - AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ - AS2( shl ebx, 2*8)\ - AS2( xor b, ebx)\ - AS2( shr edx, 16)\ - AS2( movzx ebx, dl)\ - AS2( shr edx, 8)\ - AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\ - AS2( shl ebx, 1*8)\ - AS2( xor c, ebx)\ - AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(dx)])\ - AS2( xor d, ebx) - - QUARTER_ROUND(eax, ecx, esi, edi) - RESTORE_2(edx) - QUARTER_ROUND(edi, eax, ecx, esi) - RESTORE_1(edx) - QUARTER_ROUND(esi, edi, eax, ecx) - RESTORE_0(edx) - QUARTER_ROUND(ecx, esi, edi, eax) + AS2( pxor xmm2, [L_LASTROUND]) + AS2( movdqu [WORD_REG(bx)], xmm2) -#undef QUARTER_ROUND + ASJ( jle, 7, f) + AS2( mov [L_LENGTH], WORD_REG(cx)) + AS2( test WORD_REG(cx), 1) + ASJ( jnz, 1, b) +#if CRYPTOPP_BOOL_X64 + AS2( movdqa xmm0, [L_INCREMENTS]) + AS2( paddd xmm0, [L_INBLOCKS]) + AS2( movdqa [L_INBLOCKS], xmm0) +#endif + ASJ( jmp, 3, b) + ASL(7) #if CRYPTOPP_BOOL_X86 - AS1(emms) - AS1(pop ebp) - #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) - AS1(pop ebx) - #endif + AS2( mov esp, [L_SP]) + AS1( emms) +#else + AS2( mov rbp, [L_BP]) +#endif + AS_POP_IF86( bp) + AS_POP_IF86( bx) +#ifndef __GNUC__ + AS_POP_IF86( di) + AS_POP_IF86( si) +#endif +#ifdef CRYPTOPP_GENERATE_X64_MASM + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + Rijndael_Enc_AdvancedProcessBlocks ENDP +#else + AS1( ret) #endif - #ifdef __GNUC__ - ".att_syntax prefix;" - : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) - : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) - : "memory", "cc" - #if CRYPTOPP_BOOL_X64 - , "%ebx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" - #endif - ); + ".att_syntax prefix;" + ); +#endif +} - if (xorBlock) - { - t0 ^= ((const word32 *)xorBlock)[0]; - t1 ^= ((const word32 *)xorBlock)[1]; - t2 ^= ((const word32 *)xorBlock)[2]; - t3 ^= ((const word32 *)xorBlock)[3]; - } - ((word32 *)outBlock)[0] = t0; - ((word32 *)outBlock)[1] = t1; - ((word32 *)outBlock)[2] = t2; - ((word32 *)outBlock)[3] = t3; -#else - #if CRYPTOPP_BOOL_X64 - mov rbx, [rsp + 6*8 + 7*8] ; xorBlock - #else - AS2( mov ebx, xorBlock) - #endif - AS2( test WORD_REG(bx), WORD_REG(bx)) - ASJ( jz, 1, f) - AS2( xor eax, [WORD_REG(bx)+0*4]) - AS2( xor ecx, [WORD_REG(bx)+1*4]) - AS2( xor esi, [WORD_REG(bx)+2*4]) - AS2( xor edi, [WORD_REG(bx)+3*4]) - ASL(1) - #if CRYPTOPP_BOOL_X64 - mov rbx, [rsp + 7*8 + 7*8] ; outBlock - #else - AS2( mov ebx, outBlock) - #endif - AS2( mov [WORD_REG(bx)+0*4], eax) - AS2( mov [WORD_REG(bx)+1*4], ecx) - AS2( mov [WORD_REG(bx)+2*4], esi) - AS2( mov [WORD_REG(bx)+3*4], edi) #endif -#if CRYPTOPP_GENERATE_X64_MASM - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbx - ret - Rijndael_Enc_ProcessAndXorBlock ENDP -#else +#ifndef CRYPTOPP_GENERATE_X64_MASM + +#ifdef CRYPTOPP_X64_MASM_AVAILABLE +extern "C" { +void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k); +} +#endif + +static inline bool AliasedWithTable(const byte *begin, const byte *end) +{ + size_t s0 = size_t(begin)%4096, s1 = 
size_t(end)%4096; + size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096; + if (t1 > t0) + return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1); + else + return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); +} + +size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +{ + if (length < BLOCKSIZE) + return length; + + if (HasSSE2()) + { + struct Locals + { + word32 subkeys[4*12], workspace[8]; + const byte *inBlocks, *inXorBlocks, *outXorBlocks; + byte *outBlocks; + size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement; + size_t regSpill, lengthAndCounterFlag, keysBegin; + }; + + const byte* zeros = (byte *)(Te+256); + byte *space; + + do { + space = (byte *)alloca(255+sizeof(Locals)); + space += (256-(size_t)space%256)%256; + } + while (AliasedWithTable(space, space+sizeof(Locals))); + + Locals &locals = *(Locals *)space; + + locals.inBlocks = inBlocks; + locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros; + locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks; + locals.outBlocks = outBlocks; + + locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : BLOCKSIZE; + locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? BLOCKSIZE : 0; + locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : BLOCKSIZE; + locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : BLOCKSIZE; + + locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter); + int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2); + locals.keysBegin = (12-keysToCopy)*16; + + #ifdef __GNUC__ + __asm__ __volatile__ + ( + AS1(call Rijndael_Enc_AdvancedProcessBlocks) + : + : "c" (&locals), "d" (m_key.begin()), "S" (Te), "D" (g_cacheLineSize) + : "memory", "cc", "%eax" + #if CRYPTOPP_BOOL_X64 + , "%rbx", "%r8", "%r10", "%r11", "%r12" + #endif + ); + #else + Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key); + #endif + return length%16; } else -#endif -#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE -#ifndef CRYPTOPP_GENERATE_X64_MASM + return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); +} + +void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) { + Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); + return; + } +#endif + word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -508,42 +773,56 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; +#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS + for (i=0; i<2048; i+=cacheLineSize) +#else for (i=0; i<1024; i+=cacheLineSize) +#endif u &= *(const word32 *)(((const byte *)Te)+i); u &= Te[255]; s0 |= u; s1 |= u; s2 |= u; s3 |= u; - // first round -#ifdef IS_BIG_ENDIAN #define QUARTER_ROUND(t, a, b, c, d) \ - a ^= rotrFixed(Te[byte(t)], 24); t >>= 8;\ - b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ - c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ - d ^= Te[t]; + a ^= TL(3, byte(t)); t >>= 8;\ + b ^= TL(2, byte(t)); t >>= 8;\ + c ^= TL(1, byte(t)); t >>= 8;\ + d ^= TL(0, t); + +#ifdef IS_LITTLE_ENDIAN + #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS + #define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (6-i)%4+1)) + #else + #define TL(i, x) rotrFixed(Te[x], (3-i)*8) + #endif + #define 
QUARTER_ROUND1(t, a, b, c, d) QUARTER_ROUND(t, d, c, b, a) #else -#define QUARTER_ROUND(t, a, b, c, d) \ - d ^= Te[byte(t)]; t >>= 8;\ - c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ - b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ - a ^= rotrFixed(Te[t], 24); + #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS + #define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (4-i)%4)) + #else + #define TL(i, x) rotrFixed(Te[x], i*8) + #endif + #define QUARTER_ROUND1 QUARTER_ROUND #endif - QUARTER_ROUND(s3, t0, t1, t2, t3) - QUARTER_ROUND(s2, t3, t0, t1, t2) - QUARTER_ROUND(s1, t2, t3, t0, t1) - QUARTER_ROUND(s0, t1, t2, t3, t0) -#undef QUARTER_ROUND + QUARTER_ROUND1(s3, t0, t1, t2, t3) + QUARTER_ROUND1(s2, t3, t0, t1, t2) + QUARTER_ROUND1(s1, t2, t3, t0, t1) + QUARTER_ROUND1(s0, t1, t2, t3, t0) + +#if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) && defined(IS_LITTLE_ENDIAN) + #undef TL + #define TL(i, x) (*(word32 *)((byte *)Te + x*8 + (i+3)%4+1)) +#endif + +#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS + #undef TL + #define TL(i, x) Te[i*256 + x] +#endif // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { -#define QUARTER_ROUND(t, a, b, c, d) \ - a ^= Te[3*256+byte(t)]; t >>= 8;\ - b ^= Te[2*256+byte(t)]; t >>= 8;\ - c ^= Te[1*256+byte(t)]; t >>= 8;\ - d ^= Te[t]; - s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; QUARTER_ROUND(t3, s0, s1, s2, s3) @@ -562,23 +841,16 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; } while (--r); - // timing attack countermeasure. see comments at top for more details - u = 0; - for (i=0; i<256; i+=cacheLineSize) - u &= *(const word32 *)(Se+i); - u &= *(const word32 *)(Se+252); - t0 |= u; t1 |= u; t2 |= u; t3 |= u; - word32 tbw[4]; byte *const tempBlock = (byte *)tbw; word32 *const obw = (word32 *)outBlock; const word32 *const xbw = (const word32 *)xorBlock; #define QUARTER_ROUND(t, a, b, c, d) \ - tempBlock[a] = Se[byte(t)]; t >>= 8;\ - tempBlock[b] = Se[byte(t)]; t >>= 8;\ - tempBlock[c] = Se[byte(t)]; t >>= 8;\ - tempBlock[d] = Se[t]; + tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ + tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ + tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ + tempBlock[d] = ((byte *)(Te+t))[1]; QUARTER_ROUND(t2, 15, 2, 5, 8) QUARTER_ROUND(t1, 11, 14, 1, 4) @@ -600,7 +872,6 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock obw[2] = tbw[2] ^ rk[2]; obw[3] = tbw[3] ^ rk[3]; } - } } void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const -- cgit v1.2.1
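
The header comment added at the top of rijndael.cpp says the rewrite "switched to compressed tables". The standalone sketch below (not part of the patch or of Crypto++; names and layout are ad hoc) illustrates what that means for the encryption table: under CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS, FillEncTable() packs, for each S-box output s, the byte sequence 0, s, s, 2s, 3s, s, s, 2s into one 8-byte entry (2 KB total), and the TL(i,x) macro's unaligned 32-bit loads at byte offsets 1 through 4 recover the same four values that the classic Te0..Te3 layout stores in 4 KB. The program checks that equivalence for every possible byte value; s stands in for Se[i], the GF(2^8) arithmetic mirrors the f2/f3 macros, and a little-endian host is assumed, as in the x86/x64 code paths. Halving the table footprint to 2 KB is also why the cache-warming loops in the patch now walk 2048 bytes instead of 1024.

// Sketch only: verifies the compressed 8-byte table layout built by
// FillEncTable() against the rotated Te0..Te3 values it replaces.
#include <cstdint>
#include <cstdio>
#include <cstring>

// GF(2^8) doubling modulo the AES polynomial 0x11b, matching the f2() macro
// (the 0x100 term clears the bit shifted out of the byte).
static uint8_t f2(uint8_t x) { return static_cast<uint8_t>((x << 1) ^ (((x >> 7) & 1) * 0x11b)); }
static uint8_t f3(uint8_t x) { return static_cast<uint8_t>(f2(x) ^ x); }   // 3*x = 2*x ^ x

static uint32_t rotr32(uint32_t x, unsigned r) { return r ? (x >> r) | (x << (32 - r)) : x; }

int main()
{
    for (unsigned v = 0; v < 256; v++)
    {
        uint8_t s = static_cast<uint8_t>(v);   // stands in for the S-box output Se[i]

        // Uncompressed layout: Te0[i] = f3(s) | s<<8 | s<<16 | f2(s)<<24, and
        // Te1..Te3 are its successive 8-bit right rotations (4 x 1 KB in total).
        uint32_t te0 = f3(s) | uint32_t(s) << 8 | uint32_t(s) << 16 | uint32_t(f2(s)) << 24;

        // Compressed layout, as built by FillEncTable(): one 64-bit entry whose
        // little-endian bytes are 0, s, s, 2s, 3s, s, s, 2s (256 x 8 bytes = 2 KB).
        uint32_t y = uint32_t(s) << 8 | uint32_t(s) << 16 | uint32_t(f2(s)) << 24;
        uint64_t entry = (uint64_t(y | f3(s)) << 32) | y;

        uint8_t bytes[8];
        std::memcpy(bytes, &entry, 8);

        // Unaligned-style reads at byte offsets 1,2,3,4 give Te1, Te2, Te3, Te0,
        // which is what TL(i,x) = *(word32*)((byte*)Te + x*8 + (6-i)%4+1) relies on.
        for (unsigned k = 0; k < 4; k++)
        {
            uint32_t fromEntry;
            std::memcpy(&fromEntry, bytes + 1 + k, 4);
            uint32_t expected = rotr32(te0, 8 * ((k + 1) & 3));
            if (fromEntry != expected)
            {
                std::printf("mismatch: s=%u offset=%u\n", v, 1 + k);
                return 1;
            }
        }
    }
    std::printf("compressed 8-byte entries reproduce Te0..Te3 for all 256 byte values\n");
    return 0;
}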