author    weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>  2007-09-24 00:43:57 +0000
committer weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>  2007-09-24 00:43:57 +0000
commit    982ba6fa712d44275c2541b6b9badf489cf9eda6 (patch)
tree      7d4e77f11bb8dc49557b634d8380767aef1b8502 /salsa.cpp
parent    489a156f9bc41028439b6375af6314e473565847 (diff)
- port x64 assembly code to MASM
- improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly

git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@396 57ff6487-cd31-0410-9ec3-f628ee90f5f0
Diffstat (limited to 'salsa.cpp')
-rwxr-xr-x  salsa.cpp  719
1 file changed, 473 insertions, 246 deletions
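
The port works because the instruction stream is written once, through macros, and expanded differently per target: preprocessing with "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" leaves bare MASM text behind for ml64.exe, while a normal build turns the same lines into inline assembly. A minimal sketch of the idea behind the AS2 macro used throughout the diff below (simplified and hypothetical; the real Crypto++ definitions live in cpu.h and differ in detail):

    // One two-operand instruction list, three expansions (simplified
    // sketch, not the actual Crypto++ macro definitions):
    #if defined(CRYPTOPP_GENERATE_X64_MASM)
        #define AS2(x, y) x, y                  // bare text survives "cl /EP /P"
    #elif defined(__GNUC__)
        #define AS2(x, y) #x ", " #y "\n\t"     // stringized fragment for one big asm("...")
    #else
        #define AS2(x, y) __asm { x, y }        // MSVC inline assembler statement
    #endif

    AS2(mov eax, 10)  // MASM text, a string fragment, or an __asm block
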
diff --git a/salsa.cpp b/salsa.cpp
index f24e7ab..4681ec6 100755
--- a/salsa.cpp
+++ b/salsa.cpp
@@ -1,6 +1,11 @@
// salsa.cpp - written and placed in the public domain by Wei Dai
+// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
+
#include "pch.h"
+
+#ifndef CRYPTOPP_GENERATE_X64_MASM
+
#include "salsa.h"
#include "misc.h"
#include "argnames.h"
@@ -53,7 +58,7 @@ void Salsa20_Policy::SeekToIteration(lword iterationCount)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int Salsa20_Policy::GetAlignment() const
{
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
return 16;
else
@@ -63,7 +68,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
{
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
return 4*BYTES_PER_ITERATION;
else
@@ -72,267 +77,489 @@ unsigned int Salsa20_Policy::GetOptimalBlockSize() const
}
#endif
+#ifdef CRYPTOPP_X64_MASM_AVAILABLE
+extern "C" {
+void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
+}
+#endif
+
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
- int i;
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
- #define SSE2_QUARTER_ROUND(a, b, d, i) {\
- __m128i t = _mm_add_epi32(a, d); \
- b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \
- b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
+#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
+
+#ifdef CRYPTOPP_X64_MASM_AVAILABLE
+ Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
+ return;
+#endif
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#ifdef CRYPTOPP_GENERATE_X64_MASM
+ ALIGN 8
+ Salsa20_OperateKeystream PROC FRAME
+ mov r10, [rsp + 5*8] ; state
+ alloc_stack(10*16 + 32*16 + 8)
+ save_xmm128 xmm6, 0200h
+ save_xmm128 xmm7, 0210h
+ save_xmm128 xmm8, 0220h
+ save_xmm128 xmm9, 0230h
+ save_xmm128 xmm10, 0240h
+ save_xmm128 xmm11, 0250h
+ save_xmm128 xmm12, 0260h
+ save_xmm128 xmm13, 0270h
+ save_xmm128 xmm14, 0280h
+ save_xmm128 xmm15, 0290h
+ .endprolog
+
+ #define REG_output rcx
+ #define REG_input rdx
+ #define REG_iterationCount r8
+ #define REG_state r10
+ #define REG_rounds eax
+ #define REG_temp32 r11d
+ #define REG_temp r11
+ #define SSE2_WORKSPACE rsp
+ #define SSE2_LOAD_ROUNDS mov eax, r9d
+#else
if (HasSSE2())
{
- __m128i *s = (__m128i *)m_state.data();
-
-#if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86)
- // This code triggers an internal compiler error on MSVC 2005 when compiling
- // for x64 with optimizations on. hopefully it will get fixed in the next release.
- // A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123
- // Also, GCC 3.4.4 generates incorrect code for x86 at -O2.
- // GCC 4.1.1 generates incorrect code for x64 at -O2
- if (iterationCount >= 4)
- {
- __m128i ss[16];
- ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
- ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
- ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
- ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
- ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
- ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
- ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
- ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
- ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
- ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
- ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
- ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
- ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
- ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
-
- do
- {
- word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
- for (i=0; i<4; i++)
- {
- countersLo[i] = m_state[8];
- countersHi[i] = m_state[5];
- if (++m_state[8] == 0)
- ++m_state[5];
- }
-
- __m128i x0 = ss[0];
- __m128i x1 = ss[1];
- __m128i x2 = ss[2];
- __m128i x3 = ss[3];
- __m128i x4 = ss[4];
- __m128i x5 = ss[5];
- __m128i x6 = ss[6];
- __m128i x7 = ss[7];
- __m128i x8 = ss[8];
- __m128i x9 = ss[9];
- __m128i x10 = ss[10];
- __m128i x11 = ss[11];
- __m128i x12 = ss[12];
- __m128i x13 = ss[13];
- __m128i x14 = ss[14];
- __m128i x15 = ss[15];
-
- for (i=m_rounds; i>0; i-=2)
- {
- #define QUARTER_ROUND(a, b, c, d) \
- SSE2_QUARTER_ROUND(a, b, d, 7) \
- SSE2_QUARTER_ROUND(b, c, a, 9) \
- SSE2_QUARTER_ROUND(c, d, b, 13) \
- SSE2_QUARTER_ROUND(d, a, c, 18)
-
- QUARTER_ROUND(x0, x4, x8, x12)
- QUARTER_ROUND(x1, x5, x9, x13)
- QUARTER_ROUND(x2, x6, x10, x14)
- QUARTER_ROUND(x3, x7, x11, x15)
-
- QUARTER_ROUND(x0, x13, x10, x7)
- QUARTER_ROUND(x1, x14, x11, x4)
- QUARTER_ROUND(x2, x15, x8, x5)
- QUARTER_ROUND(x3, x12, x9, x6)
-
- #undef QUARTER_ROUND
- }
-
- x0 = _mm_add_epi32(x0, ss[0]);
- x1 = _mm_add_epi32(x1, ss[1]);
- x2 = _mm_add_epi32(x2, ss[2]);
- x3 = _mm_add_epi32(x3, ss[3]);
- x4 = _mm_add_epi32(x4, ss[4]);
- x5 = _mm_add_epi32(x5, ss[5]);
- x6 = _mm_add_epi32(x6, ss[6]);
- x7 = _mm_add_epi32(x7, ss[7]);
- x8 = _mm_add_epi32(x8, ss[8]);
- x9 = _mm_add_epi32(x9, ss[9]);
- x10 = _mm_add_epi32(x10, ss[10]);
- x11 = _mm_add_epi32(x11, ss[11]);
- x12 = _mm_add_epi32(x12, ss[12]);
- x13 = _mm_add_epi32(x13, ss[13]);
- x14 = _mm_add_epi32(x14, ss[14]);
- x15 = _mm_add_epi32(x15, ss[15]);
-
- #define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\
- __m128i t0 = _mm_unpacklo_epi32(a, b);\
- __m128i t1 = _mm_unpacklo_epi32(c, d);\
- __m128i t2 = _mm_unpacklo_epi64(t0, t1);\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
- t2 = _mm_unpackhi_epi64(t0, t1);\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
- t0 = _mm_unpackhi_epi32(a, b);\
- t1 = _mm_unpackhi_epi32(c, d);\
- t2 = _mm_unpacklo_epi64(t0, t1);\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
- t2 = _mm_unpackhi_epi64(t0, t1);\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
-
- #define SALSA_OUTPUT(x) \
- OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
- OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
- OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
- OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
-
- CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
-
- #undef SALSA_OUTPUT
- } while ((iterationCount-=4) >= 4);
- }
+ #if CRYPTOPP_BOOL_X64
+ #define REG_output %4
+ #define REG_input %1
+ #define REG_iterationCount %2
+ #define REG_state %3
+ #define REG_rounds eax
+ #define REG_temp32 edx
+ #define REG_temp rdx
+ #define SSE2_WORKSPACE %5
+ #define SSE2_LOAD_ROUNDS AS2(mov eax, %0)
+
+ __m128i workspace[32];
+ #else
+ #define REG_output edi
+ #define REG_input eax
+ #define REG_iterationCount ecx
+ #define REG_state esi
+ #define REG_rounds ebx
+ #define REG_temp32 edx
+ #define REG_temp edx
+ #define SSE2_WORKSPACE esp + WORD_SZ
+ #ifdef __GNUC__
+ // this assumes that a frame pointer is used
+ #define SSE2_LOAD_ROUNDS ".att_syntax prefix;movl %0, %%ebx;.intel_syntax noprefix;"
+ #else
+ #define SSE2_LOAD_ROUNDS AS2(mov REG_rounds, r)
+ #endif
+ #endif
+
+ word32 r = m_rounds;
+
+ #ifdef __GNUC__
+ __asm__ __volatile__
+ (
+ ".intel_syntax noprefix;"
+ AS_PUSH_IF86( bx)
+ #else
+ void *s = m_state.data();
+
+ AS2( mov REG_iterationCount, iterationCount)
+ AS2( mov REG_state, s)
+ AS2( mov REG_input, input)
+ AS2( mov REG_output, output)
+ #endif
+#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
+
+ AS2( cmp REG_iterationCount, 4)
+ ASJ( jl, 5, f)
+
+#if CRYPTOPP_BOOL_X86
+ AS2( mov ebx, esp)
+ AS2( and esp, -16)
+ AS2( sub esp, 32*16)
+ AS1( push ebx)
#endif
- if (!IsP4() && iterationCount > 0)
- {
- const __m128i s_maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
- const __m128i s_maskHi32 = _mm_slli_epi64(s_maskLo32, 32);
-
- do
- {
- __m128i x0 = s[0];
- __m128i x1 = s[1];
- __m128i x2 = s[2];
- __m128i x3 = s[3];
-
- for (i=m_rounds; i>0; i-=2)
- {
- SSE2_QUARTER_ROUND(x0, x1, x3, 7)
- SSE2_QUARTER_ROUND(x1, x2, x0, 9)
- SSE2_QUARTER_ROUND(x2, x3, x1, 13)
- SSE2_QUARTER_ROUND(x3, x0, x2, 18)
-
- x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
- x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
- x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
-
- SSE2_QUARTER_ROUND(x0, x3, x1, 7)
- SSE2_QUARTER_ROUND(x3, x2, x0, 9)
- SSE2_QUARTER_ROUND(x2, x1, x3, 13)
- SSE2_QUARTER_ROUND(x1, x0, x2, 18)
-
- x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
- x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
- x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
- }
-
- x0 = _mm_add_epi32(x0, s[0]);
- x1 = _mm_add_epi32(x1, s[1]);
- x2 = _mm_add_epi32(x2, s[2]);
- x3 = _mm_add_epi32(x3, s[3]);
-
- if (++m_state[8] == 0)
- ++m_state[5];
-
- __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
- k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
- __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
- k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
- __m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32));
- __m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32));
-
- __m128i k0 = _mm_unpackhi_epi64(k02, k20);
- __m128i k1 = _mm_unpackhi_epi64(k13, k31);
- __m128i k2 = _mm_unpacklo_epi64(k20, k02);
- __m128i k3 = _mm_unpacklo_epi64(k31, k13);
-
- #define SSE2_OUTPUT(x) {\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
- CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
-
- CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
- }
- while (--iterationCount);
- }
+#define SSE2_EXPAND_S(i, j) \
+ ASS( pshufd xmm4, xmm##i, j, j, j, j) \
+ AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
+
+ AS2( movdqa xmm0, [REG_state + 0*16])
+ AS2( movdqa xmm1, [REG_state + 1*16])
+ AS2( movdqa xmm2, [REG_state + 2*16])
+ AS2( movdqa xmm3, [REG_state + 3*16])
+ SSE2_EXPAND_S(0, 0)
+ SSE2_EXPAND_S(0, 1)
+ SSE2_EXPAND_S(0, 2)
+ SSE2_EXPAND_S(0, 3)
+ SSE2_EXPAND_S(1, 0)
+ SSE2_EXPAND_S(1, 2)
+ SSE2_EXPAND_S(1, 3)
+ SSE2_EXPAND_S(2, 1)
+ SSE2_EXPAND_S(2, 2)
+ SSE2_EXPAND_S(2, 3)
+ SSE2_EXPAND_S(3, 0)
+ SSE2_EXPAND_S(3, 1)
+ SSE2_EXPAND_S(3, 2)
+ SSE2_EXPAND_S(3, 3)
+
+#define SSE2_EXPAND_S85(i) \
+ AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_rounds) \
+ AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
+ AS2( add REG_rounds, 1) \
+ AS2( adc REG_temp32, 0)
+
+ ASL(1)
+ AS2( mov REG_rounds, dword ptr [REG_state + 8*4])
+ AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
+ SSE2_EXPAND_S85(0)
+ SSE2_EXPAND_S85(1)
+ SSE2_EXPAND_S85(2)
+ SSE2_EXPAND_S85(3)
+ AS2( mov dword ptr [REG_state + 8*4], REG_rounds)
+ AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
+
+#define SSE2_QUARTER_ROUND(a, b, d, i) \
+ AS2( movdqa xmm4, xmm##d) \
+ AS2( paddd xmm4, xmm##a) \
+ AS2( movdqa xmm5, xmm4) \
+ AS2( pslld xmm4, i) \
+ AS2( psrld xmm5, 32-i) \
+ AS2( pxor xmm##b, xmm4) \
+ AS2( pxor xmm##b, xmm5)
+
+#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
+#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
+#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
+#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
+#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
+#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
+#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
+#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
+#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
+#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
+#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
+#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
+#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
+#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
+#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
+#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
+#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
+#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
+#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
+#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
+#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
+#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
+#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
+#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
+#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
+#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
+#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
+#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
+#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
+#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
+#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
+#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
+
+#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
+ L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
+ L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
+ L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
+ L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
+ L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
+ L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
+ L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
+ L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
+ L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
+ L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
+ L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
+ L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
+ L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
+ L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
+ L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
+ L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
+ L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
+ L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
+ L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
+ L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
+ L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
+ L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
+ L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
+ L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
+ L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
+ L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
+ L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
+ L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
+ L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
+ L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
+ L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
+ L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
+
+#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
+ L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
+ L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
+ L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
+ L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
+ L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
+ L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
+ L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
+ L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
+ L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
+ L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
+ L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
+ L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
+ L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
+ L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
+ L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
+ L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
+ L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
+ L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
+ L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
+ L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
+ L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
+ L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
+ L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
+ L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
+ L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
+ L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
+ L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
+ L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
+ L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
+ L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
+ L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
+ L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
+
+#if CRYPTOPP_BOOL_X64
+ SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
+#else
+ SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
+ SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
+#endif
+ SSE2_LOAD_ROUNDS
+ ASJ( jmp, 2, f)
+
+ ASL(SSE2_Salsa_Output)
+ AS2( movdqa xmm0, xmm4)
+ AS2( punpckldq xmm4, xmm5)
+ AS2( movdqa xmm1, xmm6)
+ AS2( punpckldq xmm6, xmm7)
+ AS2( movdqa xmm2, xmm4)
+ AS2( punpcklqdq xmm4, xmm6) // e
+ AS2( punpckhqdq xmm2, xmm6) // f
+ AS2( punpckhdq xmm0, xmm5)
+ AS2( punpckhdq xmm1, xmm7)
+ AS2( movdqa xmm6, xmm0)
+ AS2( punpcklqdq xmm0, xmm1) // g
+ AS2( punpckhqdq xmm6, xmm1) // h
+ AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
+ AS1( ret)
+
+ ASL(6)
+#if CRYPTOPP_BOOL_X64
+ SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
+ ASL(2)
+ SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
+#else
+ SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
+ SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
+ ASL(2)
+ SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
+ SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
+#endif
+ AS2( sub REG_rounds, 2)
+ ASJ( jnz, 6, b)
+
+#define SSE2_OUTPUT_4(a, b, c, d) \
+ AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
+ AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
+ AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
+ AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
+ AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
+ AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
+ AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
+ AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
+ ASC( call, SSE2_Salsa_Output)
+
+ SSE2_OUTPUT_4(0, 13, 10, 7)
+ SSE2_OUTPUT_4(4, 1, 14, 11)
+ SSE2_OUTPUT_4(8, 5, 2, 15)
+ SSE2_OUTPUT_4(12, 9, 6, 3)
+ AS2( test REG_input, REG_input)
+ ASJ( jz, 9, f)
+ AS2( add REG_input, 12*16)
+ ASL(9)
+ AS2( add REG_output, 12*16)
+ AS2( sub REG_iterationCount, 4)
+ AS2( cmp REG_iterationCount, 4)
+ ASJ( jge, 1, b)
+ AS_POP_IF86( sp)
+
+ ASL(5)
+ AS2( sub REG_iterationCount, 1)
+ ASJ( jl, 4, f)
+ AS2( movdqa xmm0, [REG_state + 0*16])
+ AS2( movdqa xmm1, [REG_state + 1*16])
+ AS2( movdqa xmm2, [REG_state + 2*16])
+ AS2( movdqa xmm3, [REG_state + 3*16])
+ SSE2_LOAD_ROUNDS
+
+ ASL(0)
+ SSE2_QUARTER_ROUND(0, 1, 3, 7)
+ SSE2_QUARTER_ROUND(1, 2, 0, 9)
+ SSE2_QUARTER_ROUND(2, 3, 1, 13)
+ SSE2_QUARTER_ROUND(3, 0, 2, 18)
+ ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
+ ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
+ ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
+ SSE2_QUARTER_ROUND(0, 3, 1, 7)
+ SSE2_QUARTER_ROUND(3, 2, 0, 9)
+ SSE2_QUARTER_ROUND(2, 1, 3, 13)
+ SSE2_QUARTER_ROUND(1, 0, 2, 18)
+ ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
+ ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
+ ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
+ AS2( sub REG_rounds, 2)
+ ASJ( jnz, 0, b)
+
+ AS2( paddd xmm0, [REG_state + 0*16])
+ AS2( paddd xmm1, [REG_state + 1*16])
+ AS2( paddd xmm2, [REG_state + 2*16])
+ AS2( paddd xmm3, [REG_state + 3*16])
+
+ AS2( add dword ptr [REG_state + 8*4], 1)
+ AS2( adc dword ptr [REG_state + 5*4], 0)
+
+ AS2( pcmpeqb xmm6, xmm6) // all ones
+ AS2( psrlq xmm6, 32) // lo32 mask
+ ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
+ AS2( movdqa xmm4, xmm0)
+ AS2( movdqa xmm5, xmm3)
+ AS2( pand xmm0, xmm7)
+ AS2( pand xmm4, xmm6)
+ AS2( pand xmm3, xmm6)
+ AS2( pand xmm5, xmm7)
+ AS2( por xmm4, xmm5) // 0,13,2,15
+ AS2( movdqa xmm5, xmm1)
+ AS2( pand xmm1, xmm7)
+ AS2( pand xmm5, xmm6)
+ AS2( por xmm0, xmm5) // 4,1,6,3
+ AS2( pand xmm6, xmm2)
+ AS2( pand xmm2, xmm7)
+ AS2( por xmm1, xmm6) // 8,5,10,7
+ AS2( por xmm2, xmm3) // 12,9,14,11
+
+ AS2( movdqa xmm5, xmm4)
+ AS2( movdqa xmm6, xmm0)
+ AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
+ AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
+ AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
+ AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
+
+ // output keystream
+ AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
+ ASJ( jmp, 5, b)
+ ASL(4)
+
+#ifdef __GNUC__
+ AS_POP_IF86( bx)
+ ".att_syntax prefix;"
+ :
+ #if CRYPTOPP_BOOL_X64
+ : "r" (r), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace)
+ : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+ #else
+ : "m" (r), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
+ : "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ #endif
+ );
+#endif
+#ifdef CRYPTOPP_GENERATE_X64_MASM
+ movdqa xmm6, [rsp + 0200h]
+ movdqa xmm7, [rsp + 0210h]
+ movdqa xmm8, [rsp + 0220h]
+ movdqa xmm9, [rsp + 0230h]
+ movdqa xmm10, [rsp + 0240h]
+ movdqa xmm11, [rsp + 0250h]
+ movdqa xmm12, [rsp + 0260h]
+ movdqa xmm13, [rsp + 0270h]
+ movdqa xmm14, [rsp + 0280h]
+ movdqa xmm15, [rsp + 0290h]
+ add rsp, 10*16 + 32*16 + 8
+ ret
+Salsa20_OperateKeystream ENDP
+#else
}
+ else
#endif
-
- word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
- while (iterationCount--)
+#endif
+#ifndef CRYPTOPP_GENERATE_X64_MASM
{
- x0 = m_state[0];
- x1 = m_state[1];
- x2 = m_state[2];
- x3 = m_state[3];
- x4 = m_state[4];
- x5 = m_state[5];
- x6 = m_state[6];
- x7 = m_state[7];
- x8 = m_state[8];
- x9 = m_state[9];
- x10 = m_state[10];
- x11 = m_state[11];
- x12 = m_state[12];
- x13 = m_state[13];
- x14 = m_state[14];
- x15 = m_state[15];
-
- for (i=m_rounds; i>0; i-=2)
+ word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+
+ while (iterationCount--)
{
- #define QUARTER_ROUND(a, b, c, d) \
- b = b ^ rotlFixed(a + d, 7); \
- c = c ^ rotlFixed(b + a, 9); \
- d = d ^ rotlFixed(c + b, 13); \
- a = a ^ rotlFixed(d + c, 18);
-
- QUARTER_ROUND(x0, x4, x8, x12)
- QUARTER_ROUND(x1, x5, x9, x13)
- QUARTER_ROUND(x2, x6, x10, x14)
- QUARTER_ROUND(x3, x7, x11, x15)
-
- QUARTER_ROUND(x0, x13, x10, x7)
- QUARTER_ROUND(x1, x14, x11, x4)
- QUARTER_ROUND(x2, x15, x8, x5)
- QUARTER_ROUND(x3, x12, x9, x6)
- }
+ x0 = m_state[0];
+ x1 = m_state[1];
+ x2 = m_state[2];
+ x3 = m_state[3];
+ x4 = m_state[4];
+ x5 = m_state[5];
+ x6 = m_state[6];
+ x7 = m_state[7];
+ x8 = m_state[8];
+ x9 = m_state[9];
+ x10 = m_state[10];
+ x11 = m_state[11];
+ x12 = m_state[12];
+ x13 = m_state[13];
+ x14 = m_state[14];
+ x15 = m_state[15];
+
+ for (int i=m_rounds; i>0; i-=2)
+ {
+ #define QUARTER_ROUND(a, b, c, d) \
+ b = b ^ rotlFixed(a + d, 7); \
+ c = c ^ rotlFixed(b + a, 9); \
+ d = d ^ rotlFixed(c + b, 13); \
+ a = a ^ rotlFixed(d + c, 18);
+
+ QUARTER_ROUND(x0, x4, x8, x12)
+ QUARTER_ROUND(x1, x5, x9, x13)
+ QUARTER_ROUND(x2, x6, x10, x14)
+ QUARTER_ROUND(x3, x7, x11, x15)
+
+ QUARTER_ROUND(x0, x13, x10, x7)
+ QUARTER_ROUND(x1, x14, x11, x4)
+ QUARTER_ROUND(x2, x15, x8, x5)
+ QUARTER_ROUND(x3, x12, x9, x6)
+ }
- #define SALSA_OUTPUT(x) {\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
- CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
+ #define SALSA_OUTPUT(x) {\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
+ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
- CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
+ CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
#endif
- if (++m_state[8] == 0)
- ++m_state[5];
+ if (++m_state[8] == 0)
+ ++m_state[5];
+ }
}
} // see comment above if an internal compiler error occurs here
NAMESPACE_END
+
+#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
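
The second bullet of the commit message is visible in the GCC constraint lists above: instead of push/pop sequences or an RBP frame inside the asm, the x64 path allocates __m128i workspace[32] in C++ and passes its address in as operand %5, so RSP and RBP never move inside the asm block and the unwinder's frame description stays valid throughout. A standalone sketch of that pattern (hypothetical demo function, assuming GCC on x86-64; not Crypto++ code):

    #include <emmintrin.h>
    #include <stdint.h>

    // Unwind-friendly x64 inline asm: the compiler allocates the scratch
    // space and the asm writes through an operand, never adjusting RSP or
    // clobbering RBP itself.
    static uint32_t asm_scratch_demo()
    {
        __m128i workspace[4];              // compiler-managed, 16-byte aligned
        uint32_t result;
        __asm__ __volatile__
        (
            "pxor   %%xmm0, %%xmm0\n\t"    // xmm0 = 0
            "movdqa %%xmm0, (%1)\n\t"      // scratch write via operand, not rsp
            "movd   %%xmm0, %0\n\t"        // result = low dword of xmm0
            : "=r" (result)
            : "r" (workspace)
            : "memory", "xmm0"
        );
        return result;
    }

The MASM side gets the same property a different way: the PROC FRAME prologue in the diff (alloc_stack, save_xmm128, .endprolog) emits Win64 unwind opcodes, so Windows can walk the frame without a base pointer even though the function does adjust RSP.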