author     weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>  2010-07-24 05:55:22 +0000
committer  weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>  2010-07-24 05:55:22 +0000
commit     8532f317b3440154b421b1e8b8b004ead28f847e (patch)
tree       9fa57aeee5c779a3c9b4f88006050d81ff68e6ef
parent     5e47408d6c3c40f0aafaa2b32a2ae0889f9fc089 (diff)
download   cryptopp-8532f317b3440154b421b1e8b8b004ead28f847e.tar.gz
add support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@508 57ff6487-cd31-0410-9ec3-f628ee90f5f0
-rw-r--r--  Readme.txt    |  39
-rw-r--r--  bench.cpp     |  19
-rw-r--r--  config.h      |   7
-rwxr-xr-x  cpu.cpp       |  52
-rwxr-xr-x  cpu.h         |  48
-rw-r--r--  cryptlib.h    |   2
-rw-r--r--  datatest.cpp  |  41
-rw-r--r--  files.cpp     |   4
-rw-r--r--  gcm.cpp       | 207
-rw-r--r--  gcm.h         |   1
-rw-r--r--  modes.cpp     |   6
-rw-r--r--  modes.h       |   1
-rw-r--r--  rijndael.cpp  | 358
-rw-r--r--  rijndael.h    |   3
-rw-r--r--  validat1.cpp  |   1
15 files changed, 631 insertions, 158 deletions
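
The new HasAESNI() and HasCLMUL() probes (cpu.h) can be queried directly, but ordinary callers pick up the accelerated paths transparently through the usual GCM interface. A minimal usage sketch against this tree (not part of the patch; classes and headers as in Crypto++ 5.6):

    #include "aes.h"
    #include "gcm.h"
    #include "cpu.h"
    #include "filters.h"
    #include "osrng.h"
    #include <iostream>
    #include <string>

    int main()
    {
        using namespace CryptoPP;

        // 1 if the CPU reports the instruction sets, 0 otherwise
        std::cout << "AES-NI: " << HasAESNI() << ", CLMUL: " << HasCLMUL() << std::endl;

        AutoSeededRandomPool rng;
        byte key[AES::DEFAULT_KEYLENGTH], iv[12];
        rng.GenerateBlock(key, sizeof(key));
        rng.GenerateBlock(iv, sizeof(iv));

        GCM<AES>::Encryption enc;
        enc.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));

        // The filter uses the AES-NI/CLMUL code paths automatically when available.
        std::string plaintext = "attack at dawn", ciphertext;
        StringSource ss(plaintext, true,
            new AuthenticatedEncryptionFilter(enc, new StringSink(ciphertext)));

        std::cout << "ciphertext+tag bytes: " << ciphertext.size() << std::endl;
        return 0;
    }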
diff --git a/Readme.txt b/Readme.txt
index 178c5f7..ff995a6 100644
--- a/Readme.txt
+++ b/Readme.txt
@@ -414,19 +414,30 @@ the mailing list.
- ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0,
and Borland C++Builder 2007
-5.6 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
- - added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED
- - added support for variable length IVs
- - added OIDs for Brainpool elliptic curve parameters
- - improved AES and SHA-256 speed on x86 and x64
- - fixed incorrect VMAC computation on message lengths
- that are >64 mod 128 (x86 assembly version is not affected)
- - fixed compiler error in vmac.cpp on x86 with GCC -fPIC
- - fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
- - fixed HashFilter bug when putMessage=true
- - removed WORD64_AVAILABLE; compiler support for 64-bit int is now required
- - ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11
-
-5.6.1 - switched to a public domain implementation of MARS
+5.6.0 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
+ - added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED
+ - added support for variable length IVs
+ - added OIDs for Brainpool elliptic curve parameters
+ - improved AES and SHA-256 speed on x86 and x64
+ - changed BlockTransformation interface to no longer assume data alignment
+ - fixed incorrect VMAC computation on message lengths
+ that are >64 mod 128 (x86 assembly version is not affected)
+ - fixed compiler error in vmac.cpp on x86 with GCC -fPIC
+ - fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
+ - fixed HashFilter bug when putMessage=true
+ - fixed AES-CTR data alignment bug that causes incorrect encryption on ARM
+ - removed WORD64_AVAILABLE; compiler support for 64-bit int is now required
+ - ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11
+
+5.6.1 - added support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM
+ - removed WAKE-CFB
+ - fixed several bugs in the SHA-256 x86/x64 assembly code:
+ * incorrect hash on non-SSE2 x86 machines on non-aligned input
+ * incorrect hash on x86 machines when input crosses 0x80000000
+ * incorrect hash on x64 when compiled with GCC with optimizations enabled
+ - fixed bugs in AES x86 and x64 assembly causing crashes in some MSVC build configurations
+ - switched to a public domain implementation of MARS
+ - ported to MSVC 2010, Sun Studio 12u1
+ - renamed the MSVC DLL project to "cryptopp" for compatibility with MSVC 2010
Written by Wei Dai
diff --git a/bench.cpp b/bench.cpp
index cee316c..8521a5a 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -10,6 +10,7 @@
#include "hex.h"
#include "modes.h"
#include "factory.h"
+#include "cpu.h"
#include <time.h>
#include <math.h>
@@ -242,14 +243,24 @@ void BenchmarkAll(double t, double hertz)
cout << "<THEAD><TR><TH>Algorithm<TH>MiB/Second" << cpb << "<TH>Microseconds to<br>Setup Key and IV" << cpk << endl;
cout << "\n<TBODY style=\"background: yellow\">";
- BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048));
- BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
+ if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
+ BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM");
+ else
+ {
+ BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048));
+ BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
+ }
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/CCM");
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/EAX");
cout << "\n<TBODY style=\"background: white\">";
- BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
- BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
+ if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
+ BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
+ else
+ {
+ BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
+ BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
+ }
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-64");
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-128");
BenchMarkByName<MessageAuthenticationCode>("HMAC(SHA-1)");
diff --git a/config.h b/config.h
index f53cbce..3a03844 100644
--- a/config.h
+++ b/config.h
@@ -257,6 +257,7 @@ NAMESPACE_END
#endif
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
+ // C++Builder 2010 does not allow "call label" where label is defined within inline assembly
#define CRYPTOPP_X86_ASM_AVAILABLE
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300)
@@ -288,6 +289,12 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif
+#if defined(CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE) && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110)
+ #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
+#else
+ #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
+#endif
+
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
#define CRYPTOPP_BOOL_ALIGN16_ENABLED 1
#else
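
The new CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE macro only says the compiler can emit the AES-NI/CLMUL intrinsics; every use site in this patch still checks the CPU at run time. The recurring guard pattern, shown here as an illustration only:

    #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        // fast path built from _mm_aesenc_si128() and friends
    }
    else
    #endif
    {
        // portable table-driven fallback
    }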
diff --git a/cpu.cpp b/cpu.cpp
index 11c27b8..c1a1d95 100755
--- a/cpu.cpp
+++ b/cpu.cpp
@@ -8,7 +8,7 @@
#include "misc.h"
#include <algorithm>
-#ifdef __GNUC__
+#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#include <signal.h>
#include <setjmp.h>
#endif
@@ -19,9 +19,19 @@
NAMESPACE_BEGIN(CryptoPP)
-#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+#ifdef CRYPTOPP_CPUID_AVAILABLE
+
+#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
-#ifndef _MSC_VER
+bool CpuId(word32 input, word32 *output)
+{
+ __cpuid((int *)output, input);
+ return true;
+}
+
+#else
+
+#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
typedef void (*SigHandler)(int);
static jmp_buf s_jmpNoCPUID;
@@ -29,11 +39,17 @@ static void SigIllHandlerCPUID(int)
{
longjmp(s_jmpNoCPUID, 1);
}
+
+static jmp_buf s_jmpNoSSE2;
+static void SigIllHandlerSSE2(int)
+{
+ longjmp(s_jmpNoSSE2, 1);
+}
#endif
bool CpuId(word32 input, word32 *output)
{
-#ifdef _MSC_VER
+#ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
__try
{
__asm
@@ -80,31 +96,13 @@ bool CpuId(word32 input, word32 *output)
#endif
}
-#ifndef _MSC_VER
-static jmp_buf s_jmpNoSSE2;
-static void SigIllHandlerSSE2(int)
-{
- longjmp(s_jmpNoSSE2, 1);
-}
-#endif
-
-#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
-
-bool CpuId(word32 input, word32 *output)
-{
- __cpuid((int *)output, input);
- return true;
-}
-
#endif
-#ifdef CRYPTOPP_CPUID_AVAILABLE
-
static bool TrySSE2()
{
#if CRYPTOPP_BOOL_X64
return true;
-#elif defined(_MSC_VER)
+#elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
__try
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@@ -119,7 +117,7 @@ static bool TrySSE2()
return false;
}
return true;
-#elif defined(__GNUC__)
+#else
SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
if (oldHandler == SIG_ERR)
return false;
@@ -139,8 +137,6 @@ static bool TrySSE2()
signal(SIGILL, oldHandler);
return result;
-#else
- return false;
#endif
}
@@ -160,8 +156,8 @@ void DetectX86Features()
if ((cpuid1[3] & (1 << 26)) != 0)
g_hasSSE2 = TrySSE2();
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
- g_hasAESNI = (cpuid1[2] & (1<<25)) != 0;
- g_hasCLMUL = (cpuid1[2] & (1<<1)) != 0;
+ g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
+ g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
if ((cpuid1[3] & (1 << 25)) != 0)
g_hasISSE = true;
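
The detection above reads CPUID leaf 1 and requires SSE2 before trusting the new bits: AES-NI is ECX bit 25 and PCLMULQDQ (CLMUL) is ECX bit 1. A standalone illustration of the same feature bits, assuming GCC/Clang and <cpuid.h> (not code from the patch):

    #include <cpuid.h>
    #include <cstdio>

    int main()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;                              // CPUID leaf 1 not supported
        std::printf("AES-NI: %u, PCLMULQDQ: %u\n",
                    (ecx >> 25) & 1u, (ecx >> 1) & 1u);
        return 0;
    }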
diff --git a/cpu.h b/cpu.h
index 79e5ea8..9a6ee22 100755
--- a/cpu.h
+++ b/cpu.h
@@ -18,22 +18,18 @@
NAMESPACE_BEGIN(CryptoPP)
-#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
#define CRYPTOPP_CPUID_AVAILABLE
// these should not be used directly
extern CRYPTOPP_DLL bool g_x86DetectionDone;
-extern CRYPTOPP_DLL bool g_hasSSE2;
-extern CRYPTOPP_DLL bool g_hasISSE;
-extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_hasAESNI;
extern CRYPTOPP_DLL bool g_hasCLMUL;
extern CRYPTOPP_DLL bool g_isP4;
extern CRYPTOPP_DLL word32 g_cacheLineSize;
CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
-
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output);
#if CRYPTOPP_BOOL_X64
@@ -42,6 +38,10 @@ inline bool HasISSE() {return true;}
inline bool HasMMX() {return true;}
#else
+extern CRYPTOPP_DLL bool g_hasSSE2;
+extern CRYPTOPP_DLL bool g_hasISSE;
+extern CRYPTOPP_DLL bool g_hasMMX;
+
inline bool HasSSE2()
{
if (!g_x86DetectionDone)
@@ -107,22 +107,8 @@ inline int GetCacheLineSize()
return CRYPTOPP_L1_CACHE_LINE_SIZE;
}
-inline bool HasSSSE3() {return false;}
-inline bool IsP4() {return false;}
-
-// assume MMX and SSE2 if intrinsics are enabled
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_X64
-inline bool HasSSE2() {return true;}
-inline bool HasISSE() {return true;}
-inline bool HasMMX() {return true;}
-#else
-inline bool HasSSE2() {return false;}
-inline bool HasISSE() {return false;}
-inline bool HasMMX() {return false;}
#endif
-#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400
-
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
@@ -134,7 +120,19 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline*
#define AS_HEX(y) 0##y##h
-#elif defined(__GNUC__)
+#elif defined(_MSC_VER) || defined(__BORLANDC__)
+ #define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+ #define AS1(x) __asm {x}
+ #define AS2(x, y) __asm {x, y}
+ #define AS3(x, y, z) __asm {x, y, z}
+ #define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
+ #define ASL(x) __asm {label##x:}
+ #define ASJ(x, y, z) __asm {x label##y}
+ #define ASC(x, y) __asm {x label##y}
+ #define CRYPTOPP_NAKED __declspec(naked)
+ #define AS_HEX(y) 0x##y
+#else
+ #define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";"
@@ -150,16 +148,6 @@ inline bool HasMMX() {return false;}
#define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y
-#else
- #define AS1(x) __asm {x}
- #define AS2(x, y) __asm {x, y}
- #define AS3(x, y, z) __asm {x, y, z}
- #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
- #define ASL(x) __asm {label##x:}
- #define ASJ(x, y, z) __asm {x label##y}
- #define ASC(x, y) __asm {x label##y}
- #define CRYPTOPP_NAKED __declspec(naked)
- #define AS_HEX(y) 0x##y
#endif
#define IF0(y)
diff --git a/cryptlib.h b/cryptlib.h
index fbd7631..c6f4c42 100644
--- a/cryptlib.h
+++ b/cryptlib.h
@@ -456,7 +456,7 @@ public:
//! return number of blocks that can be processed in parallel, for bit-slicing implementations
virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;}
- enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8} FlagsForAdvancedProcessBlocks;
+ enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8, BT_AllowParallel=16} FlagsForAdvancedProcessBlocks;
//! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks)
/*! /note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */
diff --git a/datatest.cpp b/datatest.cpp
index cfac74a..121e0c5 100644
--- a/datatest.cpp
+++ b/datatest.cpp
@@ -57,15 +57,15 @@ const std::string & GetRequiredDatum(const TestData &data, const char *name)
return i->second;
}
-void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish)
+void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish, const std::string &channel=DEFAULT_CHANNEL)
{
while (source.MaxRetrievable() > (finish ? 0 : 4096))
{
byte buf[4096+64];
- word32 start = GlobalRNG().GenerateWord32(0, 63);
- word32 len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, source.MaxRetrievable()));
- source.Get(buf+start, len);
- target.Put(buf+start, len);
+ size_t start = GlobalRNG().GenerateWord32(0, 63);
+ size_t len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, 3*source.MaxRetrievable()/2));
+ len = source.Get(buf+start, len);
+ target.ChannelPut(channel, buf+start, len);
}
}
@@ -397,9 +397,9 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
return;
}
- StringSource ss(plaintext, false, new StreamTransformationFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING));
- ss.Pump(plaintext.size()/2 + 1);
- ss.PumpAll();
+ StreamTransformationFilter encFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING);
+ RandomizedTransfer(StringStore(plaintext).Ref(), encFilter, true);
+ encFilter.MessageEnd();
/*{
std::string z;
encryptor->Seek(seek);
@@ -422,14 +422,14 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
{
std::cout << "incorrectly encrypted: ";
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
- xx.Pump(256); xx.Flush(false);
+ xx.Pump(2048); xx.Flush(false);
std::cout << "\n";
SignalTestFailure();
}
std::string decrypted;
- StringSource dd(encrypted, false, new StreamTransformationFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING));
- dd.Pump(plaintext.size()/2 + 1);
- dd.PumpAll();
+ StreamTransformationFilter decFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING);
+ RandomizedTransfer(StringStore(encrypted).Ref(), decFilter, true);
+ decFilter.MessageEnd();
if (decrypted != plaintext)
{
std::cout << "incorrectly decrypted: ";
@@ -484,27 +484,24 @@ void TestAuthenticatedSymmetricCipher(TestData &v, const NameValuePairs &overrid
StringStore sh(header), sp(plaintext), sc(ciphertext), sf(footer), sm(mac);
if (macAtBegin)
- sm.TransferTo(df);
+ RandomizedTransfer(sm, df, true);
sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
- sc.TransferTo(df);
+ RandomizedTransfer(sc, df, true);
sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
if (!macAtBegin)
- sm.TransferTo(df);
+ RandomizedTransfer(sm, df, true);
df.MessageEnd();
- sh.TransferTo(ef, sh.MaxRetrievable()/2+1, AAD_CHANNEL);
- sh.TransferTo(ef, LWORD_MAX, AAD_CHANNEL);
- sp.TransferTo(ef, sp.MaxRetrievable()/2+1);
- sp.TransferTo(ef);
- sf.TransferTo(ef, sf.MaxRetrievable()/2+1, AAD_CHANNEL);
- sf.TransferTo(ef, LWORD_MAX, AAD_CHANNEL);
+ RandomizedTransfer(sh, ef, true, AAD_CHANNEL);
+ RandomizedTransfer(sp, ef, true);
+ RandomizedTransfer(sf, ef, true, AAD_CHANNEL);
ef.MessageEnd();
if (test == "Encrypt" && encrypted != ciphertext+mac)
{
std::cout << "incorrectly encrypted: ";
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
- xx.Pump(256); xx.Flush(false);
+ xx.Pump(2048); xx.Flush(false);
std::cout << "\n";
SignalTestFailure();
}
diff --git a/files.cpp b/files.cpp
index 6a29ed5..453b562 100644
--- a/files.cpp
+++ b/files.cpp
@@ -95,7 +95,7 @@ size_t FileStore::TransferTo2(BufferedTransformation &target, lword &transferByt
m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize));
}
- m_len = m_stream->gcount();
+ m_len = (size_t)m_stream->gcount();
size_t blockedBytes;
output:
blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking);
@@ -242,7 +242,7 @@ size_t FileSink::Put2(const byte *inString, size_t length, int messageEnd, bool
size = numeric_limits<std::streamsize>::max();
m_stream->write((const char *)inString, size);
inString += size;
- length -= size;
+ length -= (size_t)size;
}
if (messageEnd)
diff --git a/gcm.cpp b/gcm.cpp
index 8367227..610db97 100644
--- a/gcm.cpp
+++ b/gcm.cpp
@@ -14,6 +14,11 @@
#include <emmintrin.h>
#endif
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+#endif
+
NAMESPACE_BEGIN(CryptoPP)
word16 GCM_Base::s_reductionTable[256];
@@ -47,6 +52,21 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
}
Block::Put(NULL, c)(Z0)(Z1);
}
+
+__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
+{
+ word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
+ word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
+
+ PolynomialMod2 pa((byte *)A, 8);
+ PolynomialMod2 pb((byte *)B, 8);
+ PolynomialMod2 c = pa*pb;
+
+ __m128i output;
+ for (int i=0; i<16; i++)
+ ((byte *)&output)[i] = c.GetByte(i);
+ return output;
+}
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@@ -66,6 +86,56 @@ inline static void Xor16(byte *a, const byte *b, const byte *c)
((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1];
}
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = {
+ 0xe100000000000000, 0xc200000000000000,
+ 0x08090a0b0c0d0e0f, 0x0001020304050607,
+ 0x0001020304050607, 0x08090a0b0c0d0e0f};
+static const __m128i *s_clmulConstants = (const __m128i *)s_clmulConstants64;
+static const unsigned int s_clmulTableSizeInBlocks = 8;
+
+inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
+{
+ /*
+ The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
+ significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the
+ rightmost bit positions, and the lowest byte addresses.
+
+ c1 ^= c0t * 0xc200000000000000
+ c2t ^= c0t
+ t = shift (c1t ^ c0b) left 1 bit
+ c2 ^= t * 0xe100000000000000
+ c2t ^= c1b
+ shift c2 left 1 bit and xor in lowest bit of c1t
+ */
+#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
+ c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
+#else
+ c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
+#endif
+ c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
+ c0 = _mm_srli_si128(c0, 8);
+ c0 = _mm_xor_si128(c0, c1);
+ c0 = _mm_slli_epi64(c0, 1);
+ c0 = _mm_clmulepi64_si128(c0, r, 0);
+ c2 = _mm_xor_si128(c2, c0);
+ c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
+ c1 = _mm_unpacklo_epi64(c1, c2);
+ c1 = _mm_srli_epi64(c1, 63);
+ c2 = _mm_slli_epi64(c2, 1);
+ return _mm_xor_si128(c2, c1);
+}
+
+inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
+{
+ __m128i c0 = _mm_clmulepi64_si128(x,h,0);
+ __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
+ __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
+
+ return CLMUL_Reduce(c0, c1, c2, r);
+}
+#endif
+
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
{
BlockCipher &blockCipher = AccessBlockCipher();
@@ -74,26 +144,56 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
- int tableSize;
- if (params.GetIntValue(Name::TableSize(), tableSize))
- tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
+ int tableSize, i, j, k;
+
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasCLMUL())
+ {
+ params.GetIntValue(Name::TableSize(), tableSize); // avoid "parameter not used" error
+ tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
+ }
else
- tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
+#endif
+ {
+ if (params.GetIntValue(Name::TableSize(), tableSize))
+ tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
+ else
+ tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
#if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400)
- // VC 2003 workaround: compiler generates bad code for 64K tables
- tableSize = 2*1024;
+ // VC 2003 workaround: compiler generates bad code for 64K tables
+ tableSize = 2*1024;
#endif
+ }
m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
+ byte *table = MulTable();
byte *hashKey = HashKey();
memset(hashKey, 0, REQUIRED_BLOCKSIZE);
blockCipher.ProcessBlock(hashKey);
- byte *table = MulTable();
- int i, j, k;
- word64 V0, V1;
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasCLMUL())
+ {
+ const __m128i r = s_clmulConstants[0];
+ __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)hashKey), s_clmulConstants[1]);
+ __m128i h = h0;
+ for (i=0; i<tableSize; i+=32)
+ {
+ __m128i h1 = CLMUL_GF_Mul(h, h0, r);
+ _mm_storel_epi64((__m128i *)(table+i), h);
+ _mm_storeu_si128((__m128i *)(table+i+16), h1);
+ _mm_storeu_si128((__m128i *)(table+i+8), h);
+ _mm_storel_epi64((__m128i *)(table+i+8), h1);
+ h = CLMUL_GF_Mul(h1, h0, r);
+ }
+
+ return;
+ }
+#endif
+
+ word64 V0, V1;
typedef BlockGetAndPut<word64, BigEndian> Block;
Block::Get(hashKey)(V0)(V1);
@@ -178,6 +278,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
}
}
+inline void GCM_Base::ReverseHashBufferIfNeeded()
+{
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasCLMUL())
+ {
+ __m128i &x = *(__m128i *)HashBuffer();
+ x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
+ }
+#endif
+}
+
void GCM_Base::Resync(const byte *iv, size_t len)
{
BlockCipher &cipher = AccessBlockCipher();
@@ -209,6 +320,8 @@ void GCM_Base::Resync(const byte *iv, size_t len)
PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
+
+ ReverseHashBufferIfNeeded();
}
if (m_state >= State_IVSet)
@@ -241,6 +354,73 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
{
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasCLMUL())
+ {
+ const __m128i *table = (const __m128i *)MulTable();
+ __m128i x = _mm_load_si128((__m128i *)HashBuffer());
+ const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2];
+
+ while (len >= 16)
+ {
+ size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
+ __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-1)*16)), bswapMask2);;
+ __m128i c0 = _mm_setzero_si128();
+ __m128i c1 = _mm_setzero_si128();
+ __m128i c2 = _mm_setzero_si128();
+
+ while (true)
+ {
+ __m128i h0 = _mm_load_si128(table+i);
+ __m128i h1 = _mm_load_si128(table+i+1);
+ __m128i h01 = _mm_xor_si128(h0, h1);
+
+ if (++i == s)
+ {
+ d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
+ d = _mm_xor_si128(d, x);
+ c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
+ c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
+ d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
+ c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0));
+ break;
+ }
+
+ d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask2);
+ c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
+ c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
+ d2 = _mm_xor_si128(d2, d);
+ c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1));
+
+ if (++i == s)
+ {
+ d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
+ d = _mm_xor_si128(d, x);
+ c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
+ c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
+ d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
+ c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
+ break;
+ }
+
+ d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask);
+ c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
+ c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
+ d = _mm_xor_si128(d, d2);
+ c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
+ }
+ data += s*16;
+ len -= s*16;
+
+ c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
+ x = CLMUL_Reduce(c0, c1, c2, r);
+ }
+
+ _mm_store_si128((__m128i *)HashBuffer(), x);
+ return len;
+ }
+#endif
+
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
word64 *hashBuffer = (word64 *)HashBuffer();
@@ -414,9 +594,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
AS2( shr WORD_REG(dx), 4 )
#endif
- #if !defined(_MSC_VER) || (_MSC_VER < 1400)
- AS_PUSH_IF86( bx)
- #endif
+ AS_PUSH_IF86( bx)
AS_PUSH_IF86( bp)
#ifdef __GNUC__
@@ -524,9 +702,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
AS2( movdqa [WORD_REG(si)], xmm0 )
AS_POP_IF86( bp)
- #if !defined(_MSC_VER) || (_MSC_VER < 1400)
- AS_POP_IF86( bx)
- #endif
+ AS_POP_IF86( bx)
#ifdef __GNUC__
".att_syntax prefix;"
@@ -647,6 +823,7 @@ void GCM_Base::AuthenticateLastConfidentialBlock()
void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
{
m_ctr.Seek(0);
+ ReverseHashBufferIfNeeded();
m_ctr.ProcessData(mac, HashBuffer(), macSize);
}
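
PCLMULQDQ multiplies two 64-bit polynomials over GF(2) into a 127-bit product; CLMUL_GF_Mul assembles a 128x128-bit GHASH multiply from three such products, and CLMUL_Reduce folds the 256-bit result modulo the GHASH polynomial. What a single carry-less multiply computes, in portable C++ (illustration only; the helper name is ad hoc, the patch uses the instruction or the PolynomialMod2 emulation above):

    #include <cstdint>
    #include <utility>

    // Carry-less 64x64 -> 128-bit multiply; returns {high, low} halves.
    std::pair<uint64_t, uint64_t> clmul64(uint64_t a, uint64_t b)
    {
        uint64_t lo = 0, hi = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1)
            {
                lo ^= a << i;                      // XOR instead of add: GF(2) arithmetic
                hi ^= i ? (a >> (64 - i)) : 0;     // bits that spill into the upper half
            }
        return std::make_pair(hi, lo);
    }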
diff --git a/gcm.h b/gcm.h
index 0133ffe..0b32524 100644
--- a/gcm.h
+++ b/gcm.h
@@ -63,6 +63,7 @@ protected:
byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;}
byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;}
byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;}
+ inline void ReverseHashBufferIfNeeded();
class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption
{
diff --git a/modes.cpp b/modes.cpp
index 81bf4de..789fafb 100644
--- a/modes.cpp
+++ b/modes.cpp
@@ -115,7 +115,7 @@ void CTR_ModePolicy::OperateKeystream(KeystreamOperation operation, byte *output
{
byte lsb = m_counterArray[s-1];
size_t blocks = UnsignedMin(iterationCount, 256U-lsb);
- m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter);
+ m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_AllowParallel);
if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0)
IncrementCounterBy256();
@@ -147,7 +147,7 @@ void BlockOrientedCipherModeBase::UncheckedSetKey(const byte *key, unsigned int
void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length)
{
assert(length%BlockSize()==0);
- m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, 0);
+ m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, BlockTransformation::BT_AllowParallel);
}
void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length)
@@ -199,7 +199,7 @@ void CBC_Decryption::ProcessData(byte *outString, const byte *inString, size_t l
unsigned int blockSize = BlockSize();
memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption
if (length > blockSize)
- m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection);
+ m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection|BlockTransformation::BT_AllowParallel);
m_cipher->ProcessAndXorBlock(inString, m_register, outString);
m_register.swap(m_temp);
}
diff --git a/modes.h b/modes.h
index ff88d31..c0c30c4 100644
--- a/modes.h
+++ b/modes.h
@@ -340,6 +340,7 @@ struct OFB_Mode_ExternalCipher : public CipherModeDocumentation
};
CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> >;
+CRYPTOPP_DLL_TEMPLATE_CLASS CipherModeFinalTemplate_ExternalCipher<ConcretePolicyHolder<Empty, AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> > > >;
//! CTR mode
template <class CIPHER>
diff --git a/rijndael.cpp b/rijndael.cpp
index a39b65d..fbc7dcc 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -5,6 +5,10 @@
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/*
+July 2010: Added support for AES-NI instructions via compiler intrinsics.
+*/
+
+/*
Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
@@ -69,6 +73,10 @@ being unloaded from L1 cache, until that round is finished.
#include "misc.h"
#include "cpu.h"
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+#include <wmmintrin.h>
+#endif
+
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
@@ -198,20 +206,83 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
m_rounds = keylen/4 + 6;
m_key.New(4*(m_rounds+1));
- word32 temp, *rk = m_key;
- const word32 *rc = rcon;
+ word32 *rk = m_key;
+
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
+ // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
+ if (HasAESNI())
+ {
+ static const word32 rcLE[] = {
+ 0x01, 0x02, 0x04, 0x08,
+ 0x10, 0x20, 0x40, 0x80,
+ 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+ };
+ const word32 *rc = rcLE;
+
+ __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
+ memcpy(rk, userKey, keylen);
+
+ while (true)
+ {
+ rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
+ rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
+ rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
+ rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
+
+ if (rk + keylen/4 + 4 == m_key.end())
+ break;
+
+ if (keylen == 24)
+ {
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ temp = _mm_insert_epi32(temp, rk[11], 3);
+ }
+ else if (keylen == 32)
+ {
+ temp = _mm_insert_epi32(temp, rk[11], 3);
+ rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+ temp = _mm_insert_epi32(temp, rk[15], 3);
+ }
+ else
+ temp = _mm_insert_epi32(temp, rk[7], 3);
+
+ rk += keylen/4;
+ }
+
+ if (!IsForwardTransformation())
+ {
+ rk = m_key;
+ unsigned int i, j;
+
+ std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
+
+ for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
+ {
+ temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
+ *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
+ *(__m128i *)(rk+j) = temp;
+ }
+
+ *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
+ }
+
+ return;
+ }
+#endif
GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
+ const word32 *rc = rcon;
+ word32 temp;
while (true)
{
temp = rk[keylen/4-1];
- rk[keylen/4] = rk[0] ^
- (word32(Se[GETBYTE(temp, 2)]) << 24) ^
- (word32(Se[GETBYTE(temp, 1)]) << 16) ^
- (word32(Se[GETBYTE(temp, 0)]) << 8) ^
- Se[GETBYTE(temp, 3)] ^
- *(rc++);
+ word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
+ rk[keylen/4] = rk[0] ^ x ^ *(rc++);
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
@@ -227,11 +298,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
else if (keylen == 32)
{
temp = rk[11];
- rk[12] = rk[ 4] ^
- (word32(Se[GETBYTE(temp, 3)]) << 24) ^
- (word32(Se[GETBYTE(temp, 2)]) << 16) ^
- (word32(Se[GETBYTE(temp, 1)]) << 8) ^
- Se[GETBYTE(temp, 0)];
+ rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
@@ -239,10 +306,15 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
rk += keylen/4;
}
+ rk = m_key;
+
if (IsForwardTransformation())
{
if (!s_TeFilled)
FillEncTable();
+
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
}
else
{
@@ -250,35 +322,37 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
FillDecTable();
unsigned int i, j;
- rk = m_key;
-
- /* invert the order of the round keys: */
- for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4) {
- temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
- temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
- temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
- temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
- }
-#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
+#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
- /* apply the inverse MixColumn transform to all round keys but the first and the last: */
- for (i = 1; i < m_rounds; i++) {
- rk += 4;
- InverseMixColumn(rk[0]);
- InverseMixColumn(rk[1]);
- InverseMixColumn(rk[2]);
- InverseMixColumn(rk[3]);
+ for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
+ {
+ temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
+ temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
+ temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
+ temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
}
+
+ rk[i+0] = InverseMixColumn(rk[i+0]);
+ rk[i+1] = InverseMixColumn(rk[i+1]);
+ rk[i+2] = InverseMixColumn(rk[i+2]);
+ rk[i+3] = InverseMixColumn(rk[i+3]);
+
+ temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
+ temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
+ temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
+ temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
}
- ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16);
- ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasAESNI())
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
+#endif
}
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasSSE2())
{
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
@@ -354,6 +428,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasAESNI())
+ {
+ Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
+ return;
+ }
+#endif
+
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3;
@@ -913,14 +995,200 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end)
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+
+inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
+{
+ block = _mm_xor_si128(block, subkeys[0]);
+ for (unsigned int i=1; i<rounds-1; i+=2)
+ {
+ block = _mm_aesenc_si128(block, subkeys[i]);
+ block = _mm_aesenc_si128(block, subkeys[i+1]);
+ }
+ block = _mm_aesenc_si128(block, subkeys[rounds-1]);
+ block = _mm_aesenclast_si128(block, subkeys[rounds]);
+}
+
+inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
+{
+ __m128i rk = subkeys[0];
+ block0 = _mm_xor_si128(block0, rk);
+ block1 = _mm_xor_si128(block1, rk);
+ block2 = _mm_xor_si128(block2, rk);
+ block3 = _mm_xor_si128(block3, rk);
+ for (unsigned int i=1; i<rounds; i++)
+ {
+ rk = subkeys[i];
+ block0 = _mm_aesenc_si128(block0, rk);
+ block1 = _mm_aesenc_si128(block1, rk);
+ block2 = _mm_aesenc_si128(block2, rk);
+ block3 = _mm_aesenc_si128(block3, rk);
+ }
+ rk = subkeys[rounds];
+ block0 = _mm_aesenclast_si128(block0, rk);
+ block1 = _mm_aesenclast_si128(block1, rk);
+ block2 = _mm_aesenclast_si128(block2, rk);
+ block3 = _mm_aesenclast_si128(block3, rk);
+}
+
+inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
+{
+ block = _mm_xor_si128(block, subkeys[0]);
+ for (unsigned int i=1; i<rounds-1; i+=2)
+ {
+ block = _mm_aesdec_si128(block, subkeys[i]);
+ block = _mm_aesdec_si128(block, subkeys[i+1]);
+ }
+ block = _mm_aesdec_si128(block, subkeys[rounds-1]);
+ block = _mm_aesdeclast_si128(block, subkeys[rounds]);
+}
+
+inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
+{
+ __m128i rk = subkeys[0];
+ block0 = _mm_xor_si128(block0, rk);
+ block1 = _mm_xor_si128(block1, rk);
+ block2 = _mm_xor_si128(block2, rk);
+ block3 = _mm_xor_si128(block3, rk);
+ for (unsigned int i=1; i<rounds; i++)
+ {
+ rk = subkeys[i];
+ block0 = _mm_aesdec_si128(block0, rk);
+ block1 = _mm_aesdec_si128(block1, rk);
+ block2 = _mm_aesdec_si128(block2, rk);
+ block3 = _mm_aesdec_si128(block3, rk);
+ }
+ rk = subkeys[rounds];
+ block0 = _mm_aesdeclast_si128(block0, rk);
+ block1 = _mm_aesdeclast_si128(block1, rk);
+ block2 = _mm_aesdeclast_si128(block2, rk);
+ block3 = _mm_aesdeclast_si128(block3, rk);
+}
+
+static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
+
+template <typename F1, typename F4>
+inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ size_t blockSize = 16;
+ size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+ size_t xorIncrement = xorBlocks ? blockSize : 0;
+ size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+ if (flags & BlockTransformation::BT_ReverseDirection)
+ {
+ assert(length % blockSize == 0);
+ inBlocks += length - blockSize;
+ xorBlocks += length - blockSize;
+ outBlocks += length - blockSize;
+ inIncrement = 0-inIncrement;
+ xorIncrement = 0-xorIncrement;
+ outIncrement = 0-outIncrement;
+ }
+
+ if (flags & BlockTransformation::BT_AllowParallel)
+ {
+ while (length >= 4*blockSize)
+ {
+ __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
+ if (flags & BlockTransformation::BT_InBlockIsCounter)
+ {
+ const __m128i be1 = *(const __m128i *)s_one;
+ block1 = _mm_add_epi32(block0, be1);
+ block2 = _mm_add_epi32(block1, be1);
+ block3 = _mm_add_epi32(block2, be1);
+ _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
+ }
+ else
+ {
+ inBlocks += inIncrement;
+ block1 = _mm_loadu_si128((const __m128i *)inBlocks);
+ inBlocks += inIncrement;
+ block2 = _mm_loadu_si128((const __m128i *)inBlocks);
+ inBlocks += inIncrement;
+ block3 = _mm_loadu_si128((const __m128i *)inBlocks);
+ inBlocks += inIncrement;
+ }
+
+ if (flags & BlockTransformation::BT_XorInput)
+ {
+ block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ }
+
+ func4(block0, block1, block2, block3, subkeys, rounds);
+
+ if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+ {
+ block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
+ xorBlocks += xorIncrement;
+ }
+
+ _mm_storeu_si128((__m128i *)outBlocks, block0);
+ outBlocks += outIncrement;
+ _mm_storeu_si128((__m128i *)outBlocks, block1);
+ outBlocks += outIncrement;
+ _mm_storeu_si128((__m128i *)outBlocks, block2);
+ outBlocks += outIncrement;
+ _mm_storeu_si128((__m128i *)outBlocks, block3);
+ outBlocks += outIncrement;
+
+ length -= 4*blockSize;
+ }
+ }
+
+ while (length >= blockSize)
+ {
+ __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
+
+ if (flags & BlockTransformation::BT_XorInput)
+ block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
+
+ if (flags & BlockTransformation::BT_InBlockIsCounter)
+ const_cast<byte *>(inBlocks)[15]++;
+
+ func1(block, subkeys, rounds);
+
+ if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+ block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
+
+ _mm_storeu_si128((__m128i *)outBlocks, block);
+
+ inBlocks += inIncrement;
+ outBlocks += outIncrement;
+ xorBlocks += xorIncrement;
+ length -= blockSize;
+ }
+
+ return length;
+}
+#endif
+
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ if (HasAESNI())
+ return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
- if (length < BLOCKSIZE)
- return length;
-
if (HasSSE2())
{
+ if (length < BLOCKSIZE)
+ return length;
+
struct Locals
{
word32 subkeys[4*12], workspace[8];
@@ -966,15 +1234,27 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
locals.keysBegin = (12-keysToCopy)*16;
Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
- return length%16;
+ return length % BLOCKSIZE;
}
- else
#endif
- return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+
+size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
+{
+ if (HasAESNI())
+ return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+
NAMESPACE_END
#endif
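
AESNI_Enc_Block above is the core pattern: an initial whitening XOR with round key 0, rounds-1 AESENC steps, and a final AESENCLAST. A minimal standalone sketch of the same flow (hypothetical helper; assumes the round keys are already expanded as UncheckedSetKey does, and an AES-NI-enabled compile such as GCC's -maes):

    #include <wmmintrin.h>

    void aesni_encrypt_block(const __m128i *rk, unsigned int rounds,
                             const unsigned char *in, unsigned char *out)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)in);
        block = _mm_xor_si128(block, rk[0]);             // initial whitening
        for (unsigned int i = 1; i < rounds; i++)
            block = _mm_aesenc_si128(block, rk[i]);      // SubBytes+ShiftRows+MixColumns+AddRoundKey
        block = _mm_aesenclast_si128(block, rk[rounds]); // final round omits MixColumns
        _mm_storeu_si128((__m128i *)out, block);
    }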
diff --git a/rijndael.h b/rijndael.h
index d602186..64c784b 100644
--- a/rijndael.h
+++ b/rijndael.h
@@ -50,6 +50,9 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+ size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
};
public:
diff --git a/validat1.cpp b/validat1.cpp
index dafeb52..696327d 100644
--- a/validat1.cpp
+++ b/validat1.cpp
@@ -252,6 +252,7 @@ bool TestSettings()
cout << "passed: ";
cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize;
+ cout << ", AESNI_INTRINSICS == " << CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE << endl;
if (!pass)
{