diff options
author | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 23:00:27 +0000 |
---|---|---|
committer | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 23:00:27 +0000 |
commit | 2bca8928ee8333a9e8823f1776c92285ade384bd (patch) | |
tree | 6d117430c0ea76a7444fa3871d1898995d825ada /rijndael.cpp | |
parent | 9c4e83bd0dc50000ab945875cf1c4b71c2d9c5ee (diff) | |
download | cryptopp-2bca8928ee8333a9e8823f1776c92285ade384bd.tar.gz |
MMX/SSE2 optimizations
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@287 57ff6487-cd31-0410-9ec3-f628ee90f5f0
Diffstat (limited to 'rijndael.cpp')
-rw-r--r-- | rijndael.cpp | 576 |
1 files changed, 369 insertions, 207 deletions
diff --git a/rijndael.cpp b/rijndael.cpp index 2a1a19e..4a8572f 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished. #include "rijndael.h" #include "misc.h" - -#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE -#pragma message("Don't know how to align data on L1 cache boundary. Defense against AES timing attack may be affected.") -#endif +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c for (i = 1; i < m_rounds; i++) { rk += 4; rk[0] = - Td0[Se[GETBYTE(rk[0], 3)]] ^ - Td1[Se[GETBYTE(rk[0], 2)]] ^ - Td2[Se[GETBYTE(rk[0], 1)]] ^ - Td3[Se[GETBYTE(rk[0], 0)]]; + Td[0*256+Se[GETBYTE(rk[0], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[0], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[0], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[0], 0)]]; rk[1] = - Td0[Se[GETBYTE(rk[1], 3)]] ^ - Td1[Se[GETBYTE(rk[1], 2)]] ^ - Td2[Se[GETBYTE(rk[1], 1)]] ^ - Td3[Se[GETBYTE(rk[1], 0)]]; + Td[0*256+Se[GETBYTE(rk[1], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[1], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[1], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[1], 0)]]; rk[2] = - Td0[Se[GETBYTE(rk[2], 3)]] ^ - Td1[Se[GETBYTE(rk[2], 2)]] ^ - Td2[Se[GETBYTE(rk[2], 1)]] ^ - Td3[Se[GETBYTE(rk[2], 0)]]; + Td[0*256+Se[GETBYTE(rk[2], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[2], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[2], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[2], 0)]]; rk[3] = - Td0[Se[GETBYTE(rk[3], 3)]] ^ - Td1[Se[GETBYTE(rk[3], 2)]] ^ - Td2[Se[GETBYTE(rk[3], 1)]] ^ - Td3[Se[GETBYTE(rk[3], 0)]]; + Td[0*256+Se[GETBYTE(rk[3], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[3], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[3], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[3], 0)]]; } } @@ -148,15 +145,245 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } -const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4; -#ifdef IS_BIG_ENDIAN -const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0; -#else -const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3; -#endif +#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#ifdef CRYPTOPP_X86_ASM_AVAILABLE + if (HasMMX()) + { + const word32 *k = m_key; + const word32 *kLoopEnd = k + m_rounds*4; +#ifdef __GNUC__ + word32 t0, t1, t2, t3; + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS1( push ebp) + AS2( mov ebp, eax) + AS2( movd mm5, ecx) +#else + AS2( mov edx, g_cacheLineSize) + AS2( mov edi, inBlock) + AS2( mov esi, k) + AS2( movd mm5, kLoopEnd) + AS1( push ebp) + AS2( lea ebp, Te) +#endif + AS2( mov eax, [esi+0*4]) // s0 + AS2( xor eax, [edi+0*4]) + AS2( movd mm0, eax) + AS2( mov ebx, [esi+1*4]) + AS2( xor ebx, [edi+1*4]) + AS2( movd mm1, ebx) + AS2( and ebx, eax) + AS2( mov eax, [esi+2*4]) + AS2( xor eax, [edi+2*4]) + AS2( movd mm2, eax) + AS2( and ebx, eax) + AS2( mov ecx, [esi+3*4]) + AS2( xor ecx, [edi+3*4]) + AS2( and ebx, ecx) + + // read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction + AS2( and ebx, 0) + AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence + ASL(2) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( and ebx, [ebp+edi]) + AS2( add edi, edx) + AS2( cmp edi, 1024) + ASJ( jl, 2, b) + AS2( and ebx, [ebp+1020]) + AS2( movd mm6, ebx) + AS2( pxor mm2, mm6) + AS2( pxor mm1, mm6) + AS2( pxor mm0, mm6) + AS2( xor ecx, ebx) + + AS2( mov edi, [esi+4*4]) // t0 + AS2( mov eax, [esi+5*4]) + AS2( mov ebx, [esi+6*4]) + AS2( mov edx, [esi+7*4]) + AS2( add esi, 8*4) + AS2( movd mm4, esi) + +#define QUARTER_ROUND(t, a, b, c, d) \ + AS2(movzx esi, t##l)\ + AS2(d, [ebp+0*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(c, [ebp+1*1024+4*esi])\ + AS2(shr e##t##x, 16)\ + AS2(movzx esi, t##l)\ + AS2(b, [ebp+2*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(a, [ebp+3*1024+4*esi]) + +#define s0 xor edi +#define s1 xor eax +#define s2 xor ebx +#define s3 xor ecx +#define t0 xor edi +#define t1 xor eax +#define t2 xor ebx +#define t3 xor edx + + QUARTER_ROUND(c, t0, t1, t2, t3) + AS2( movd ecx, mm2) + QUARTER_ROUND(c, t3, t0, t1, t2) + AS2( movd ecx, mm1) + QUARTER_ROUND(c, t2, t3, t0, t1) + AS2( movd ecx, mm0) + QUARTER_ROUND(c, t1, t2, t3, t0) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) +#undef QUARTER_ROUND + + AS2( movd esi, mm4) + + ASL(0) + AS2( mov edi, [esi+0*4]) + AS2( mov eax, [esi+1*4]) + AS2( mov ebx, [esi+2*4]) + AS2( mov ecx, [esi+3*4]) + +#define QUARTER_ROUND(t, a, b, c, d) \ + AS2(movzx esi, t##l)\ + AS2(a, [ebp+3*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(b, [ebp+2*1024+4*esi])\ + AS2(shr e##t##x, 16)\ + AS2(movzx esi, t##l)\ + AS2(c, [ebp+1*1024+4*esi])\ + AS2(movzx esi, t##h)\ + AS2(d, [ebp+0*1024+4*esi]) + + QUARTER_ROUND(d, s0, s1, s2, s3) + AS2( movd edx, mm2) + QUARTER_ROUND(d, s3, s0, s1, s2) + AS2( movd edx, mm1) + QUARTER_ROUND(d, s2, s3, s0, s1) + AS2( movd edx, mm0) + QUARTER_ROUND(d, s1, s2, s3, s0) + AS2( movd esi, mm4) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) + + AS2( mov edi, [esi+4*4]) + AS2( mov eax, [esi+5*4]) + AS2( mov ebx, [esi+6*4]) + AS2( mov edx, [esi+7*4]) + + QUARTER_ROUND(c, t0, t1, t2, t3) + AS2( movd ecx, mm2) + QUARTER_ROUND(c, t3, t0, t1, t2) + AS2( movd ecx, mm1) + QUARTER_ROUND(c, t2, t3, t0, t1) + AS2( movd ecx, mm0) + QUARTER_ROUND(c, t1, t2, t3, t0) + AS2( movd mm2, ebx) + AS2( movd mm1, eax) + AS2( movd mm0, edi) + + AS2( movd esi, mm4) + AS2( movd edi, mm5) + AS2( add esi, 8*4) + AS2( movd mm4, esi) + AS2( cmp edi, esi) + ASJ( jne, 0, b) + +#undef QUARTER_ROUND +#undef s0 +#undef s1 +#undef s2 +#undef s3 +#undef t0 +#undef t1 +#undef t2 +#undef t3 + + AS2( mov eax, [edi+0*4]) + AS2( mov ecx, [edi+1*4]) + AS2( mov esi, [edi+2*4]) + AS2( mov edi, [edi+3*4]) + +#define QUARTER_ROUND(a, b, c, d) \ + AS2( movzx ebx, dl)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 3*8)\ + AS2( xor a, ebx)\ + AS2( movzx ebx, dh)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 2*8)\ + AS2( xor b, ebx)\ + AS2( shr edx, 16)\ + AS2( movzx ebx, dl)\ + AS2( shr edx, 8)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( shl ebx, 1*8)\ + AS2( xor c, ebx)\ + AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\ + AS2( xor d, ebx) + + QUARTER_ROUND(eax, ecx, esi, edi) + AS2( movd edx, mm2) + QUARTER_ROUND(edi, eax, ecx, esi) + AS2( movd edx, mm1) + QUARTER_ROUND(esi, edi, eax, ecx) + AS2( movd edx, mm0) + QUARTER_ROUND(ecx, esi, edi, eax) + +#undef QUARTER_ROUND + + AS1( pop ebp) + AS1( emms) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) + : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) + : "memory", "cc" + ); + + if (xorBlock) + { + t0 ^= ((const word32 *)xorBlock)[0]; + t1 ^= ((const word32 *)xorBlock)[1]; + t2 ^= ((const word32 *)xorBlock)[2]; + t3 ^= ((const word32 *)xorBlock)[3]; + } + ((word32 *)outBlock)[0] = t0; + ((word32 *)outBlock)[1] = t1; + ((word32 *)outBlock)[2] = t2; + ((word32 *)outBlock)[3] = t3; +#else + AS2( mov ebx, xorBlock) + AS2( test ebx, ebx) + ASJ( jz, 1, f) + AS2( xor eax, [ebx+0*4]) + AS2( xor ecx, [ebx+1*4]) + AS2( xor esi, [ebx+2*4]) + AS2( xor edi, [ebx+3*4]) + ASL(1) + AS2( mov ebx, outBlock) + AS2( mov [ebx+0*4], eax) + AS2( mov [ebx+1*4], ecx) + AS2( mov [ebx+2*4], esi) + AS2( mov [ebx+3*4], edi) +#endif + } + else +#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE + { word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]); + for (i=0; i<1024; i+=cacheLineSize) + u &= *(const word32 *)(((const byte *)Te)+i); + u &= Te[255]; s0 |= u; s1 |= u; s2 |= u; s3 |= u; // first round - t0 ^= - Te0[GETBYTE(s0, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s3, s_i0)], 24); - t1 ^= - Te0[GETBYTE(s1, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s0, s_i0)], 24); - t2 ^= - Te0[GETBYTE(s2, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s1, s_i0)], 24); - t3 ^= - Te0[GETBYTE(s3, s_i3)] ^ - rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^ - rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^ - rotrFixed(Te0[GETBYTE(s2, s_i0)], 24); +#ifdef IS_BIG_ENDIAN +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= rotrFixed(Te[byte(t)], 24); t >>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + d ^= Te[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Te[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Te[t], 24); +#endif + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Te0[GETBYTE(t0, 3)] ^ - Te1[GETBYTE(t1, 2)] ^ - Te2[GETBYTE(t2, 1)] ^ - Te3[GETBYTE(t3, 0)] ^ - rk[0]; - s1 = - Te0[GETBYTE(t1, 3)] ^ - Te1[GETBYTE(t2, 2)] ^ - Te2[GETBYTE(t3, 1)] ^ - Te3[GETBYTE(t0, 0)] ^ - rk[1]; - s2 = - Te0[GETBYTE(t2, 3)] ^ - Te1[GETBYTE(t3, 2)] ^ - Te2[GETBYTE(t0, 1)] ^ - Te3[GETBYTE(t1, 0)] ^ - rk[2]; - s3 = - Te0[GETBYTE(t3, 3)] ^ - Te1[GETBYTE(t0, 2)] ^ - Te2[GETBYTE(t1, 1)] ^ - Te3[GETBYTE(t2, 0)] ^ - rk[3]; - - t0 = - Te0[GETBYTE(s0, 3)] ^ - Te1[GETBYTE(s1, 2)] ^ - Te2[GETBYTE(s2, 1)] ^ - Te3[GETBYTE(s3, 0)] ^ - rk[4]; - t1 = - Te0[GETBYTE(s1, 3)] ^ - Te1[GETBYTE(s2, 2)] ^ - Te2[GETBYTE(s3, 1)] ^ - Te3[GETBYTE(s0, 0)] ^ - rk[5]; - t2 = - Te0[GETBYTE(s2, 3)] ^ - Te1[GETBYTE(s3, 2)] ^ - Te2[GETBYTE(s0, 1)] ^ - Te3[GETBYTE(s1, 0)] ^ - rk[6]; - t3 = - Te0[GETBYTE(s3, 3)] ^ - Te1[GETBYTE(s0, 2)] ^ - Te2[GETBYTE(s1, 1)] ^ - Te3[GETBYTE(s2, 0)] ^ - rk[7]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Te[3*256+byte(t)]; t >>= 8;\ + b ^= Te[2*256+byte(t)]; t >>= 8;\ + c ^= Te[1*256+byte(t)]; t >>= 8;\ + d ^= Te[t]; + + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s0, s1, s2, s3) + QUARTER_ROUND(t2, s3, s0, s1, s2) + QUARTER_ROUND(t1, s2, s3, s0, s1) + QUARTER_ROUND(t0, s1, s2, s3, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. see comments at top for more details u = 0; - for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]); + for (i=0; i<256; i+=cacheLineSize) + u &= *(const word32 *)(Se+i); + u &= *(const word32 *)(Se+252); t0 |= u; t1 |= u; t2 |= u; t3 |= u; word32 tbw[4]; @@ -267,23 +467,17 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock word32 *const obw = (word32 *)outBlock; const word32 *const xbw = (const word32 *)xorBlock; - // last round - tempBlock[0] = Se[GETBYTE(t0, 3)]; - tempBlock[1] = Se[GETBYTE(t1, 2)]; - tempBlock[2] = Se[GETBYTE(t2, 1)]; - tempBlock[3] = Se[GETBYTE(t3, 0)]; - tempBlock[4] = Se[GETBYTE(t1, 3)]; - tempBlock[5] = Se[GETBYTE(t2, 2)]; - tempBlock[6] = Se[GETBYTE(t3, 1)]; - tempBlock[7] = Se[GETBYTE(t0, 0)]; - tempBlock[8] = Se[GETBYTE(t2, 3)]; - tempBlock[9] = Se[GETBYTE(t3, 2)]; - tempBlock[10] = Se[GETBYTE(t0, 1)]; - tempBlock[11] = Se[GETBYTE(t1, 0)]; - tempBlock[12] = Se[GETBYTE(t3, 3)]; - tempBlock[13] = Se[GETBYTE(t0, 2)]; - tempBlock[14] = Se[GETBYTE(t1, 1)]; - tempBlock[15] = Se[GETBYTE(t2, 0)]; +#define QUARTER_ROUND(t, a, b, c, d) \ + tempBlock[a] = Se[byte(t)]; t >>= 8;\ + tempBlock[b] = Se[byte(t)]; t >>= 8;\ + tempBlock[c] = Se[byte(t)]; t >>= 8;\ + tempBlock[d] = Se[t]; + + QUARTER_ROUND(t2, 15, 2, 5, 8) + QUARTER_ROUND(t1, 11, 14, 1, 4) + QUARTER_ROUND(t0, 7, 10, 13, 0) + QUARTER_ROUND(t3, 3, 6, 9, 12) +#undef QUARTER_ROUND if (xbw) { @@ -299,12 +493,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock obw[2] = tbw[2] ^ rk[2]; obw[3] = tbw[3] ^ rk[3]; } + } } void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { word32 s0, s1, s2, s3, t0, t1, t2, t3; - const word32 *rk = m_key; + const word32 *rk = m_key; s0 = ((const word32 *)inBlock)[0] ^ rk[0]; s1 = ((const word32 *)inBlock)[1] ^ rk[1]; @@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]); + for (i=0; i<1024; i+=cacheLineSize) + u &= *(const word32 *)(((const byte *)Td)+i); + u &= Td[255]; s0 |= u; s1 |= u; s2 |= u; s3 |= u; // first round - t0 ^= - Td0[GETBYTE(s0, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s1, s_i0)], 24); - t1 ^= - Td0[GETBYTE(s1, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s2, s_i0)], 24); - t2 ^= - Td0[GETBYTE(s2, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s3, s_i0)], 24); - t3 ^= - Td0[GETBYTE(s3, s_i3)] ^ - rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^ - rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^ - rotrFixed(Td0[GETBYTE(s0, s_i0)], 24); +#ifdef IS_BIG_ENDIAN +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= rotrFixed(Td[byte(t)], 24); t >>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + d ^= Td[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Td[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Td[t], 24); +#endif + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Td0[GETBYTE(t0, 3)] ^ - Td1[GETBYTE(t3, 2)] ^ - Td2[GETBYTE(t2, 1)] ^ - Td3[GETBYTE(t1, 0)] ^ - rk[0]; - s1 = - Td0[GETBYTE(t1, 3)] ^ - Td1[GETBYTE(t0, 2)] ^ - Td2[GETBYTE(t3, 1)] ^ - Td3[GETBYTE(t2, 0)] ^ - rk[1]; - s2 = - Td0[GETBYTE(t2, 3)] ^ - Td1[GETBYTE(t1, 2)] ^ - Td2[GETBYTE(t0, 1)] ^ - Td3[GETBYTE(t3, 0)] ^ - rk[2]; - s3 = - Td0[GETBYTE(t3, 3)] ^ - Td1[GETBYTE(t2, 2)] ^ - Td2[GETBYTE(t1, 1)] ^ - Td3[GETBYTE(t0, 0)] ^ - rk[3]; - - t0 = - Td0[GETBYTE(s0, 3)] ^ - Td1[GETBYTE(s3, 2)] ^ - Td2[GETBYTE(s2, 1)] ^ - Td3[GETBYTE(s1, 0)] ^ - rk[4]; - t1 = - Td0[GETBYTE(s1, 3)] ^ - Td1[GETBYTE(s0, 2)] ^ - Td2[GETBYTE(s3, 1)] ^ - Td3[GETBYTE(s2, 0)] ^ - rk[5]; - t2 = - Td0[GETBYTE(s2, 3)] ^ - Td1[GETBYTE(s1, 2)] ^ - Td2[GETBYTE(s0, 1)] ^ - Td3[GETBYTE(s3, 0)] ^ - rk[6]; - t3 = - Td0[GETBYTE(s3, 3)] ^ - Td1[GETBYTE(s2, 2)] ^ - Td2[GETBYTE(s1, 1)] ^ - Td3[GETBYTE(s0, 0)] ^ - rk[7]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Td[3*256+byte(t)]; t >>= 8;\ + b ^= Td[2*256+byte(t)]; t >>= 8;\ + c ^= Td[1*256+byte(t)]; t >>= 8;\ + d ^= Td[t]; + + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s2, s1, s0, s3) + QUARTER_ROUND(t2, s1, s0, s3, s2) + QUARTER_ROUND(t1, s0, s3, s2, s1) + QUARTER_ROUND(t0, s3, s2, s1, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. see comments at top for more details u = 0; - for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE) - u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]); + for (i=0; i<256; i+=cacheLineSize) + u &= *(const word32 *)(Sd+i); + u &= *(const word32 *)(Sd+252); t0 |= u; t1 |= u; t2 |= u; t3 |= u; word32 tbw[4]; @@ -413,23 +581,17 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock word32 *const obw = (word32 *)outBlock; const word32 *const xbw = (const word32 *)xorBlock; - // last round - tempBlock[0] = Sd[GETBYTE(t0, 3)]; - tempBlock[1] = Sd[GETBYTE(t3, 2)]; - tempBlock[2] = Sd[GETBYTE(t2, 1)]; - tempBlock[3] = Sd[GETBYTE(t1, 0)]; - tempBlock[4] = Sd[GETBYTE(t1, 3)]; - tempBlock[5] = Sd[GETBYTE(t0, 2)]; - tempBlock[6] = Sd[GETBYTE(t3, 1)]; - tempBlock[7] = Sd[GETBYTE(t2, 0)]; - tempBlock[8] = Sd[GETBYTE(t2, 3)]; - tempBlock[9] = Sd[GETBYTE(t1, 2)]; - tempBlock[10] = Sd[GETBYTE(t0, 1)]; - tempBlock[11] = Sd[GETBYTE(t3, 0)]; - tempBlock[12] = Sd[GETBYTE(t3, 3)]; - tempBlock[13] = Sd[GETBYTE(t2, 2)]; - tempBlock[14] = Sd[GETBYTE(t1, 1)]; - tempBlock[15] = Sd[GETBYTE(t0, 0)]; +#define QUARTER_ROUND(t, a, b, c, d) \ + tempBlock[a] = Sd[byte(t)]; t >>= 8;\ + tempBlock[b] = Sd[byte(t)]; t >>= 8;\ + tempBlock[c] = Sd[byte(t)]; t >>= 8;\ + tempBlock[d] = Sd[t]; + + QUARTER_ROUND(t2, 7, 2, 13, 8) + QUARTER_ROUND(t1, 3, 14, 9, 4) + QUARTER_ROUND(t0, 15, 10, 5, 0) + QUARTER_ROUND(t3, 11, 6, 1, 12) +#undef QUARTER_ROUND if (xbw) { |