diff options
author | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 22:54:31 +0000 |
---|---|---|
committer | weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0> | 2007-04-15 22:54:31 +0000 |
commit | 851f567904d2e6ef236fede79b02d638c71e4e2f (patch) | |
tree | d7c8c74edd548369be89d8f6e88d055995ecd95a /salsa.cpp | |
parent | 6da041704487c4c8d90b3caa5112ff1ecbb62fb2 (diff) | |
download | cryptopp-851f567904d2e6ef236fede79b02d638c71e4e2f.tar.gz |
SSE2 optimizations
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@282 57ff6487-cd31-0410-9ec3-f628ee90f5f0
Diffstat (limited to 'salsa.cpp')
-rwxr-xr-x | salsa.cpp | 369 |
1 files changed, 283 insertions, 86 deletions
@@ -4,6 +4,9 @@ #include "salsa.h" #include "misc.h" #include "argnames.h" +#include "cpu.h" + +#include <emmintrin.h> NAMESPACE_BEGIN(CryptoPP) @@ -14,11 +17,13 @@ void Salsa20_TestInstantiations() void Salsa20_Policy::CipherGetNextIV(byte *IV) { - word32 j6 = m_state[6] + 1; - word32 j7 = m_state[7] + (j6 == 0); + word32 j6, j7; + + j6 = m_state[14] + 1; + j7 = m_state[11] + (j6 == 0); - UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV, j6); - UnalignedPutWord(LITTLE_ENDIAN_ORDER, IV+4, j7); + PutWord(false, LITTLE_ENDIAN_ORDER, IV, j6); + PutWord(false, LITTLE_ENDIAN_ORDER, IV+4, j7); } void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) @@ -28,112 +33,304 @@ void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20)) throw InvalidRounds(StaticAlgorithmName(), m_rounds); - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+1, 4, key, 16); - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+11, 4, key + length - 16, 16); + // m_state is reordered for SSE2 + GetBlock<word32, LittleEndian, false> get1(key); + get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]); + GetBlock<word32, LittleEndian, false> get2(key + length - 16); + get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]); - // m_state[0,5,10,15] forms "expand 16-byte k" or "expand 32-byte k" + // "expand 16-byte k" or "expand 32-byte k" m_state[0] = 0x61707865; - m_state[5] = (length == 16) ? 0x3120646e : 0x3320646e; - m_state[10] = (length == 16) ? 0x79622d36 : 0x79622d32; - m_state[15] = 0x6b206574; + m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e; + m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32; + m_state[3] = 0x6b206574; } void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV) { - GetUserKey(LITTLE_ENDIAN_ORDER, m_state+6, 4, IV, 8); + GetBlock<word32, LittleEndian, false> get(IV); + get(m_state[14])(m_state[11]); + m_state[8] = m_state[5] = 0; } void Salsa20_Policy::SeekToIteration(lword iterationCount) { m_state[8] = (word32)iterationCount; - m_state[9] = (word32)SafeRightShift<32>(iterationCount); + m_state[5] = (word32)SafeRightShift<32>(iterationCount); +} + +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 +unsigned int Salsa20_Policy::GetAlignment() const +{ +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + return 16; + else +#endif + return 1; +} + +unsigned int Salsa20_Policy::GetOptimalBlockSize() const +{ +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + return 4*BYTES_PER_ITERATION; + else +#endif + return BYTES_PER_ITERATION; } +#endif void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { - KeystreamOutput<LittleEndian> keystreamOutput(operation, output, input); + int i; +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + { + __m128i *s = (__m128i *)m_state.data(); + + if (iterationCount >= 4) + { + __m128i ss[16]; + ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0)); + ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1)); + ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2)); + ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3)); + ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0)); + ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2)); + ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3)); + ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1)); + ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2)); + ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3)); + ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0)); + ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1)); + ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2)); + ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3)); + + do + { + word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]); + for (i=0; i<4; i++) + { + countersLo[i] = m_state[8]; + countersHi[i] = m_state[5]; + if (++m_state[8] == 0) + ++m_state[5]; + } + + __m128i x0 = ss[0]; + __m128i x1 = ss[1]; + __m128i x2 = ss[2]; + __m128i x3 = ss[3]; + __m128i x4 = ss[4]; + __m128i x5 = ss[5]; + __m128i x6 = ss[6]; + __m128i x7 = ss[7]; + __m128i x8 = ss[8]; + __m128i x9 = ss[9]; + __m128i x10 = ss[10]; + __m128i x11 = ss[11]; + __m128i x12 = ss[12]; + __m128i x13 = ss[13]; + __m128i x14 = ss[14]; + __m128i x15 = ss[15]; + + for (i=m_rounds; i>0; i-=2) + { + #define SSE2_QUARTER_ROUND(a, b, d, i) {\ + __m128i t = _mm_add_epi32(a, d); \ + b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \ + b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));} + + #define QUARTER_ROUND(a, b, c, d) \ + SSE2_QUARTER_ROUND(a, b, d, 7) \ + SSE2_QUARTER_ROUND(b, c, a, 9) \ + SSE2_QUARTER_ROUND(c, d, b, 13) \ + SSE2_QUARTER_ROUND(d, a, c, 18) + + QUARTER_ROUND(x0, x4, x8, x12) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) + + #undef QUARTER_ROUND + } + + x0 = _mm_add_epi32(x0, ss[0]); + x1 = _mm_add_epi32(x1, ss[1]); + x2 = _mm_add_epi32(x2, ss[2]); + x3 = _mm_add_epi32(x3, ss[3]); + x4 = _mm_add_epi32(x4, ss[4]); + x5 = _mm_add_epi32(x5, ss[5]); + x6 = _mm_add_epi32(x6, ss[6]); + x7 = _mm_add_epi32(x7, ss[7]); + x8 = _mm_add_epi32(x8, ss[8]); + x9 = _mm_add_epi32(x9, ss[9]); + x10 = _mm_add_epi32(x10, ss[10]); + x11 = _mm_add_epi32(x11, ss[11]); + x12 = _mm_add_epi32(x12, ss[12]); + x13 = _mm_add_epi32(x13, ss[13]); + x14 = _mm_add_epi32(x14, ss[14]); + x15 = _mm_add_epi32(x15, ss[15]); + + #define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\ + __m128i t0 = _mm_unpacklo_epi32(a, b);\ + __m128i t1 = _mm_unpacklo_epi32(c, d);\ + __m128i t2 = _mm_unpacklo_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\ + t2 = _mm_unpackhi_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\ + t0 = _mm_unpackhi_epi32(a, b);\ + t1 = _mm_unpackhi_epi32(c, d);\ + t2 = _mm_unpacklo_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\ + t2 = _mm_unpackhi_epi64(t0, t1);\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)} + + #define SALSA_OUTPUT(x) \ + OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\ + OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\ + OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\ + OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15) + + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION) + + #undef SALSA_OUTPUT + } while ((iterationCount-=4) >= 4); + } + + if (!IsP4()) while (iterationCount) + { + --iterationCount; + __m128i x0 = s[0]; + __m128i x1 = s[1]; + __m128i x2 = s[2]; + __m128i x3 = s[3]; + + for (i=m_rounds; i>0; i-=2) + { + SSE2_QUARTER_ROUND(x0, x1, x3, 7) + SSE2_QUARTER_ROUND(x1, x2, x0, 9) + SSE2_QUARTER_ROUND(x2, x3, x1, 13) + SSE2_QUARTER_ROUND(x3, x0, x2, 18) + + x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3)); + x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); + x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1)); + + SSE2_QUARTER_ROUND(x0, x3, x1, 7) + SSE2_QUARTER_ROUND(x3, x2, x0, 9) + SSE2_QUARTER_ROUND(x2, x1, x3, 13) + SSE2_QUARTER_ROUND(x1, x0, x2, 18) + + x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1)); + x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); + x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3)); + } + + x0 = _mm_add_epi32(x0, s[0]); + x1 = _mm_add_epi32(x1, s[1]); + x2 = _mm_add_epi32(x2, s[2]); + x3 = _mm_add_epi32(x3, s[3]); + + if (++m_state[8] == 0) + ++m_state[5]; + + CRYPTOPP_ALIGN_DATA(16) static const word32 masks[8] CRYPTOPP_SECTION_ALIGN16 = + {0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 0, 0xffffffff, 0}; + + __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32)); + k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3)); + __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32)); + k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3)); + __m128i maskLo32 = ((__m128i*)masks)[1], maskHi32 = ((__m128i*)masks)[0]; + __m128i k20 = _mm_or_si128(_mm_and_si128(x2, maskLo32), _mm_and_si128(x1, maskHi32)); + __m128i k31 = _mm_or_si128(_mm_and_si128(x3, maskLo32), _mm_and_si128(x2, maskHi32)); + + __m128i k0 = _mm_unpackhi_epi64(k02, k20); + __m128i k1 = _mm_unpackhi_epi64(k13, k31); + __m128i k2 = _mm_unpacklo_epi64(k20, k02); + __m128i k3 = _mm_unpacklo_epi64(k31, k13); + + #define SSE2_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\ + CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)} + + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION); + } + } +#endif word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - word32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - - j0 = m_state[0]; - j1 = m_state[1]; - j2 = m_state[2]; - j3 = m_state[3]; - j4 = m_state[4]; - j5 = m_state[5]; - j6 = m_state[6]; - j7 = m_state[7]; - j8 = m_state[8]; - j9 = m_state[9]; - j10 = m_state[10]; - j11 = m_state[11]; - j12 = m_state[12]; - j13 = m_state[13]; - j14 = m_state[14]; - j15 = m_state[15]; - - for (size_t iteration = 0; iteration < iterationCount; ++iteration) + + while (iterationCount--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - - for (int i=m_rounds; i>0; i-=2) + x0 = m_state[0]; + x1 = m_state[1]; + x2 = m_state[2]; + x3 = m_state[3]; + x4 = m_state[4]; + x5 = m_state[5]; + x6 = m_state[6]; + x7 = m_state[7]; + x8 = m_state[8]; + x9 = m_state[9]; + x10 = m_state[10]; + x11 = m_state[11]; + x12 = m_state[12]; + x13 = m_state[13]; + x14 = m_state[14]; + x15 = m_state[15]; + + for (i=m_rounds; i>0; i-=2) { -#define QUARTER_ROUND(a, b, c, d) \ - b = b ^ rotlFixed(a + d, 7); \ - c = c ^ rotlFixed(b + a, 9); \ - d = d ^ rotlFixed(c + b, 13); \ - a = a ^ rotlFixed(d + c, 18); + #define QUARTER_ROUND(a, b, c, d) \ + b = b ^ rotlFixed(a + d, 7); \ + c = c ^ rotlFixed(b + a, 9); \ + d = d ^ rotlFixed(c + b, 13); \ + a = a ^ rotlFixed(d + c, 18); QUARTER_ROUND(x0, x4, x8, x12) - QUARTER_ROUND(x5, x9, x13, x1) - QUARTER_ROUND(x10, x14, x2, x6) - QUARTER_ROUND(x15, x3, x7, x11) - - QUARTER_ROUND(x0, x1, x2, x3) - QUARTER_ROUND(x5, x6, x7, x4) - QUARTER_ROUND(x10, x11, x8, x9) - QUARTER_ROUND(x15, x12, x13, x14) + QUARTER_ROUND(x1, x5, x9, x13) + QUARTER_ROUND(x2, x6, x10, x14) + QUARTER_ROUND(x3, x7, x11, x15) + + QUARTER_ROUND(x0, x13, x10, x7) + QUARTER_ROUND(x1, x14, x11, x4) + QUARTER_ROUND(x2, x15, x8, x5) + QUARTER_ROUND(x3, x12, x9, x6) } - keystreamOutput (x0 + j0) - (x1 + j1) - (x2 + j2) - (x3 + j3) - (x4 + j4) - (x5 + j5) - (x6 + j6) - (x7 + j7) - (x8 + j8) - (x9 + j9) - (x10 + j10) - (x11 + j11) - (x12 + j12) - (x13 + j13) - (x14 + j14) - (x15 + j15); - - if (++j8 == 0) - ++j9; - } + #define SALSA_OUTPUT(x) {\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} - m_state[8] = j8; - m_state[9] = j9; + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); + + if (++m_state[8] == 0) + ++m_state[5]; + } } NAMESPACE_END |