diff options
author | weidai <weidai11@users.noreply.github.com> | 2007-04-15 23:00:27 +0000 |
---|---|---|
committer | weidai <weidai11@users.noreply.github.com> | 2007-04-15 23:00:27 +0000 |
commit | 643b3022278591c8a784f5a34f689548a4ad62b1 (patch) | |
tree | 6d117430c0ea76a7444fa3871d1898995d825ada /integer.cpp | |
parent | 3b89824be3a8d3974ad0316f2067451c3072bffc (diff) | |
download | cryptopp-git-643b3022278591c8a784f5a34f689548a4ad62b1.tar.gz |
MMX/SSE2 optimizations
Diffstat (limited to 'integer.cpp')
-rw-r--r-- | integer.cpp | 3267 |
1 files changed, 1555 insertions, 1712 deletions
diff --git a/integer.cpp b/integer.cpp index 0c5018ee..515643ed 100644 --- a/integer.cpp +++ b/integer.cpp @@ -14,30 +14,20 @@ #include "algparam.h" #include "pubkey.h" // for P1363_KDF2 #include "sha.h" +#include "cpu.h" #include <iostream> -#ifdef _M_X64 -#include <Intrin.h> +#if defined(_MSC_VER) && _MSC_VER >= 1400 + #include <intrin.h> #endif -#ifdef SSE2_INTRINSICS_AVAILABLE - #ifdef __GNUC__ - #include <xmmintrin.h> - #include <signal.h> - #include <setjmp.h> - #ifdef CRYPTOPP_MEMALIGN_AVAILABLE - #include <malloc.h> - #else - #include <stdlib.h> - #endif - #else - #include <emmintrin.h> - #endif -#elif defined(_MSC_VER) && defined(_M_IX86) - #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") -#elif defined(__GNUC__) && defined(__i386__) - #warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled." +#ifdef __DECCXX + #include <c_asm.h> +#endif + +#ifdef CRYPTOPP_MSVC6_NO_PP + #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.") #endif NAMESPACE_BEGIN(CryptoPP) @@ -50,67 +40,7 @@ bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const v return true; } -#ifdef SSE2_INTRINSICS_AVAILABLE -template <class T> -CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *) -{ - CheckSize(n); - if (n == 0) - return NULL; - if (n >= 4) - { - void *p; - #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE - while (!(p = _mm_malloc(sizeof(T)*n, 16))) - #elif defined(CRYPTOPP_MEMALIGN_AVAILABLE) - while (!(p = memalign(16, sizeof(T)*n))) - #elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) - while (!(p = malloc(sizeof(T)*n))) - #else - while (!(p = (byte *)malloc(sizeof(T)*n + 8))) // assume malloc alignment is at least 8 - #endif - CallNewHandler(); - - #ifdef CRYPTOPP_NO_ALIGNED_ALLOC - assert(m_pBlock == NULL); - m_pBlock = p; - if (!IsAlignedOn(p, 16)) - { - assert(IsAlignedOn(p, 8)); - p = (byte *)p + 8; - } - #endif - - assert(IsAlignedOn(p, 16)); - return (T*)p; - } - return new T[n]; -} - -template <class T> -void AlignedAllocator<T>::deallocate(void *p, size_type n) -{ - memset(p, 0, n*sizeof(T)); - if (n >= 4) - { - #ifdef CRYPTOPP_MM_MALLOC_AVAILABLE - _mm_free(p); - #elif defined(CRYPTOPP_NO_ALIGNED_ALLOC) - assert(m_pBlock == p || (byte *)m_pBlock+8 == p); - free(m_pBlock); - m_pBlock = NULL; - #else - free(p); - #endif - } - else - delete [] (T *)p; -} - -template class CRYPTOPP_DLL AlignedAllocator<word>; -#endif - -static int Compare(const word *A, const word *B, size_t N) +inline static int Compare(const word *A, const word *B, size_t N) { while (N--) if (A[N] > B[N]) @@ -121,7 +51,7 @@ static int Compare(const word *A, const word *B, size_t N) return 0; } -static int Increment(word *A, size_t N, word B=1) +inline static int Increment(word *A, size_t N, word B=1) { assert(N); word t = A[0]; @@ -134,7 +64,7 @@ static int Increment(word *A, size_t N, word B=1) return 1; } -static int Decrement(word *A, size_t N, word B=1) +inline static int Decrement(word *A, size_t N, word B=1) { assert(N); word t = A[0]; @@ -169,6 +99,45 @@ static word AtomicInverseModPower2(word A) // ******************************************************** +#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE + #define Declare2Words(x) dword x; + #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) + #define MultiplyWords(p, a, b) p = __emulu(a, b); + #else + #define MultiplyWords(p, a, b) p = (dword)a*b; + #endif + #define AssignWord(a, b) a = b; + #define Add2WordsBy1(a, b, c) a = b + c; + #define Acc2WordsBy1(a, b) a += b; + #define Acc2WordsBy2(a, b) a += b; + #define LowWord(a) (word)a + #define HighWord(a) (word)(a>>WORD_BITS) + #define Double2Words(a) a += a; + #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u); + #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u); + #define GetCarry(u) HighWord(u) + #define GetBorrow(u) word(u>>(WORD_BITS*2-1)) +#else + #define Declare2Words(x) word x##0, x##1; + #define AssignWord(a, b) a##0 = b; a##1 = 0; + #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); + #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b) + #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; + #define LowWord(a) a##0 + #define HighWord(a) a##1 + #ifdef _MSC_VER + #define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1); + #define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0; + #elif defined(__DECCXX) + #define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b); + #define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0; + #endif + #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);} + #define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);} + #define GetCarry(u) u##1 + #define GetBorrow(u) u##1 +#endif + class DWord { public: @@ -198,25 +167,8 @@ public: DWord r; #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE r.m_whole = (dword)a * b; - #elif defined(__alpha__) - r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b)); - #elif defined(__ia64__) - r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b)); - #elif defined(_ARCH_PPC64) - r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc"); - #elif defined(__x86_64__) - __asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc"); - #elif defined(__mips64) - __asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b)); - #elif defined(_M_X64) - r.m_halfs.low = _umul128(a, b, &r.m_halfs.high); - #elif defined(_M_IX86) - // for testing - word64 t = (word64)a * b; - r.m_halfs.high = ((word32 *)(&t))[1]; - r.m_halfs.low = (word32)t; #else - #error can not implement DWord + r.m_halfs.low = _umul128(a, b, &r.m_halfs.high); #endif return r; } @@ -457,1529 +409,1449 @@ inline word DWord::operator%(word a) // ******************************************************** -class Portable -{ -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); - - static inline void Multiply2(word *C, const word *A, const word *B); - static inline word Multiply2Add(word *C, const word *A, const word *B); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static inline unsigned int MultiplyRecursionLimit() {return 8;} - - static inline void Multiply2Bottom(word *C, const word *A, const word *B); - static void Multiply4Bottom(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); - static inline unsigned int MultiplyBottomRecursionLimit() {return 8;} - - static void Square2(word *R, const word *A); - static void Square4(word *R, const word *A); - static void Square8(word *R, const word *A) {assert(false);} - static inline unsigned int SquareRecursionLimit() {return 4;} -}; +// use some tricks to share assembly code between MSVC and GCC +#if defined(__GNUC__) + #define CRYPTOPP_NAKED + #define AddPrologue \ + __asm__ __volatile__ \ + ( \ + "push %%ebx;" /* save this manually, in case of -fPIC */ \ + "mov %2, %%ebx;" \ + ".intel_syntax noprefix;" + #define AddEpilogue \ + ".att_syntax prefix;" \ + "pop %%ebx;" \ + : \ + : "d" (C), "a" (A), "m" (B), "c" (N) \ + : "%esi", "memory", "cc" \ + ); + #define MulPrologue \ + __asm__ __volatile__ \ + ( \ + ".intel_syntax noprefix;" \ + AS1( push ebx) \ + AS2( mov ebx, edx) + #define MulEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \ + : "%esi", "memory", "cc" \ + ); + #define SquPrologue MulPrologue + #define SquEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A) \ + : "%esi", "%edi", "memory", "cc" \ + ); + #define TopPrologue MulPrologue + #define TopEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \ + : "memory", "cc" \ + ); +#else + #define CRYPTOPP_NAKED __declspec(naked) + #define AddPrologue \ + __asm push ebx \ + __asm push esi \ + __asm mov eax, [esp+12] \ + __asm mov ebx, [esp+16] + #define AddEpilogue \ + __asm pop esi \ + __asm pop ebx \ + __asm ret 8 + #define SquPrologue \ + AS2( mov eax, A) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define SquEpilogue + #define MulPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define MulEpilogue + #define TopPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( mov esi, L) \ + AS2( lea ebx, s_maskLow16) + #define TopEpilogue +#endif -int Portable::Add(word *C, const word *A, const word *B, size_t N) +#if defined(_MSC_VER) && defined(_M_X64) +extern "C" { +int Baseline_Add(size_t N, word *C, const word *A, const word *B); +int Baseline_Sub(size_t N, word *C, const word *A, const word *B); +} +#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) { - assert (N%2 == 0); + AddPrologue - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = DWord(A[i]) + B[i] + u.GetHighHalf(); - C[i] = u.GetLowHalf(); - u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf(); - C[i+1] = u.GetLowHalf(); - } - return int(u.GetHighHalf()); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( adc esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( adc esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( adc esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( adc esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) + + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) + + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // store carry into eax (return result register) + + AddEpilogue } -int Portable::Subtract(word *C, const word *A, const word *B, size_t N) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) { - assert (N%2 == 0); + AddPrologue - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow(); - C[i] = u.GetLowHalf(); - u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow(); - C[i+1] = u.GetLowHalf(); - } - return int(0-u.GetHighHalf()); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( sbb esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( sbb esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( sbb esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( sbb esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) + + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) + + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // store carry into eax (return result register) + + AddEpilogue } -void Portable::Multiply2(word *C, const word *A, const word *B) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) { -/* - word s; - dword d; + AddPrologue - if (A1 >= A0) - if (B0 >= B1) - { - s = 0; - d = (dword)(A1-A0)*(B0-B1); - } - else - { - s = (A1-A0); - d = (dword)s*(word)(B0-B1); - } - else - if (B0 > B1) - { - s = (B0-B1); - d = (word)(A1-A0)*(dword)s; - } - else - { - s = 0; - d = (dword)(A0-A1)*(B1-B0); - } -*/ - // this segment is the branchless equivalent of above - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - C[0] = A0B0.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf(); - C[1] = t.GetLowHalf(); - - t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s; - C[2] = t.GetLowHalf(); - C[3] = t.GetHighHalf(); -} - -inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B) -{ - DWord t = DWord::Multiply(A[0], B[0]); - C[0] = t.GetLowHalf(); - C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0]; -} - -word Portable::Multiply2Add(word *C, const word *A, const word *B) -{ - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - DWord t = A0B0 + C[0]; - C[0] = t.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1]; - C[1] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2]; - C[2] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3]; - C[3] = t.GetLowHalf(); - return t.GetHighHalf(); -} - -#define MulAcc(x, y) \ - p = DWord::MultiplyAndAdd(A[x], B[y], c); \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveMulAcc(s, x, y) \ - R[s] = c; \ - p = DWord::MultiplyAndAdd(A[x], B[y], d); \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); - -#define SquAcc(x, y) \ - q = DWord::Multiply(A[x], A[y]); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveSquAcc(s, x, y) \ - R[s] = c; \ - q = DWord::Multiply(A[x], A[y]); \ - p = q + d; \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -void Portable::Multiply4(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 3, 1); - MulAcc(2, 2); - MulAcc(1, 3); - - SaveMulAcc(4, 2, 3); - MulAcc(3, 2); - - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], B[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); -} - -void Portable::Square2(word *R, const word *A) -{ - DWord p, q; - word c, d, e; - - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - SquAcc(0, 1); - - R[1] = c; - p = DWord::MultiplyAndAdd(A[1], A[1], d); - R[2] = p.GetLowHalf(); - R[3] = e + p.GetHighHalf(); -} - -void Portable::Square4(word *R, const word *A) -{ -#ifdef _MSC_VER - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. - Multiply4(R, A, A); -#else - const word *B = A; - DWord p, q; - word c, d, e; + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx], mm2) + AS2( psrlq mm2, 32) + + AS2( movd mm0, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 32) + + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+8], mm2) + AS2( psrlq mm2, 32) + + AS2( movd mm0, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+12], mm2) + AS2( psrlq mm2, 32) + + AS2( add ecx, 4) + ASJ( jnz, 0, b) + + ASL(2) + AS2( movd eax, mm2) + AS1( emms) - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; + AddEpilogue +} - SquAcc(0, 1); +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) +{ + AddPrologue - SaveSquAcc(1, 2, 0); - MulAcc(1, 1); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) + + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) + + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx], mm0) + AS2( psrlq mm0, 63) + + AS2( movd mm2, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( psubq mm2, mm1) + AS2( psubq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 63) + + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx+8], mm0) + AS2( psrlq mm0, 63) + + AS2( movd mm2, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( psubq mm2, mm1) + AS2( psubq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+12], mm2) + AS2( psrlq mm2, 63) + + AS2( add ecx, 4) + ASJ( jnz, 0, b) + + ASL(2) + AS2( movd eax, mm2) + AS1( emms) - SaveSquAcc(2, 0, 3); - SquAcc(1, 2); + AddEpilogue +} +#else +int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) +{ + assert (N%2 == 0); - SaveSquAcc(3, 3, 1); - MulAcc(2, 2); + Declare2Words(u); + for (size_t i=0; i<N; i+=2) + { + AddWithCarry(u, A[i], B[i]); + C[i] = LowWord(u); + AddWithCarry(u, A[i+1], B[i+1]); + C[i+1] = LowWord(u); + } + return int(GetCarry(u)); +} - SaveSquAcc(4, 2, 3); +int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) +{ + assert (N%2 == 0); - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], A[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); + Declare2Words(u); + for (size_t i=0; i<N; i+=2) + { + SubtractWithBorrow(u, A[i], B[i]); + C[i] = LowWord(u); + SubtractWithBorrow(u, A[i+1], B[i+1]); + C[i+1] = LowWord(u); + } + return int(GetBorrow(u)); +} #endif + +static word LinearMultiply(word *C, const word *A, word B, size_t N) +{ + word carry=0; + for(unsigned i=0; i<N; i++) + { + Declare2Words(p); + MultiplyWords(p, A[i], B); + Acc2WordsBy1(p, carry); + C[i] = LowWord(p); + carry = HighWord(p); + } + return carry; } -void Portable::Multiply8(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); +#define Mul_2 \ + Mul_Begin(2) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_End(2) + +#define Mul_4 \ + Mul_Begin(4) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ + Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \ + Mul_End(4) + +#define Mul_8 \ + Mul_Begin(8) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \ + Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \ + Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \ + Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ + Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ + Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \ + Mul_End(8) + +#define Mul_16 \ + Mul_Begin(16) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ + Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ + Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ + Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ + Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ + Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ + Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ + Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \ + Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \ + Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \ + Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \ + Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \ + Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \ + Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \ + Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \ + Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \ + Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \ + Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \ + Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \ + Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ + Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ + Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \ + Mul_End(16) + +#define Squ_2 \ + Squ_Begin(2) \ + Squ_End(2) + +#define Squ_4 \ + Squ_Begin(4) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 2, 3) Squ_NonDiag \ + Squ_End(4) + +#define Squ_8 \ + Squ_Begin(8) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ + Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ + Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ + Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ + Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ + Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ + Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \ + Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \ + Squ_SaveAcc(12, 6, 7) Squ_NonDiag \ + Squ_End(8) + +#define Squ_16 \ + Squ_Begin(16) \ + Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ + Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ + Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ + Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ + Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ + Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ + Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ + Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ + Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ + Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \ + Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \ + Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \ + Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \ + Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \ + Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \ + Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \ + Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \ + Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \ + Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \ + Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \ + Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \ + Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \ + Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \ + Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \ + Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \ + Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \ + Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \ + Squ_SaveAcc(28, 14, 15) Squ_NonDiag \ + Squ_End(16) + +#define Bot_2 \ + Mul_Begin(2) \ + Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \ + Bot_End(2) + +#define Bot_4 \ + Mul_Begin(4) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \ + Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \ + Bot_End(4) + +#define Bot_8 \ + Mul_Begin(8) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \ + Bot_End(8) + +#define Bot_16 \ + Mul_Begin(16) \ + Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ + Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ + Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ + Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ + Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ + Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ + Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ + Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ + Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ + Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ + Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ + Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ + Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ + Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ + Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \ + Bot_End(16) + +#define Mul_Begin(n) \ + Declare2Words(p) \ + Declare2Words(c) \ + Declare2Words(d) \ + MultiplyWords(p, A[0], B[0]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) + +#define Mul_Acc(i, j) \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_SaveAcc(k, i, j) \ + R[k] = LowWord(c); \ + Add2WordsBy1(c, d, HighWord(c)) \ + MultiplyWords(p, A[i], B[j]) \ + AssignWord(d, HighWord(p)) \ + Acc2WordsBy1(c, LowWord(p)) + +#define Mul_End(n) \ + R[2*n-3] = LowWord(c); \ + Acc2WordsBy1(d, HighWord(c)) \ + MultiplyWords(p, A[n-1], B[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); + +#define Bot_SaveAcc(k, i, j) \ + R[k] = LowWord(c); \ + word e = LowWord(d) + HighWord(c); \ + e += A[i] * B[j]; + +#define Bot_Acc(i, j) \ + e += A[i] * B[j]; + +#define Bot_End(n) \ + R[n-1] = e; - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); - - SaveMulAcc(6, 0, 7); - MulAcc(1, 6); - MulAcc(2, 5); - MulAcc(3, 4); - MulAcc(4, 3); - MulAcc(5, 2); - MulAcc(6, 1); - MulAcc(7, 0); - - SaveMulAcc(7, 1, 7); - MulAcc(2, 6); - MulAcc(3, 5); - MulAcc(4, 4); - MulAcc(5, 3); - MulAcc(6, 2); - MulAcc(7, 1); - - SaveMulAcc(8, 2, 7); - MulAcc(3, 6); - MulAcc(4, 5); - MulAcc(5, 4); - MulAcc(6, 3); - MulAcc(7, 2); - - SaveMulAcc(9, 3, 7); - MulAcc(4, 6); - MulAcc(5, 5); - MulAcc(6, 4); - MulAcc(7, 3); +/* +// this is slower on MSVC 2005 Win32 +#define Mul_Begin(n) \ + Declare2Words(p) \ + word c; \ + Declare2Words(d) \ + MultiplyWords(p, A[0], B[0]) \ + c = LowWord(p); \ + AssignWord(d, HighWord(p)) + +#define Mul_Acc(i, j) \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(p, c) \ + c = LowWord(p); \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_SaveAcc(k, i, j) \ + R[k] = c; \ + MultiplyWords(p, A[i], B[j]) \ + Acc2WordsBy1(p, LowWord(d)) \ + c = LowWord(p); \ + AssignWord(d, HighWord(d)) \ + Acc2WordsBy1(d, HighWord(p)) + +#define Mul_End(n) \ + R[2*n-3] = c; \ + MultiplyWords(p, A[n-1], B[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); + +#define Bot_SaveAcc(k, i, j) \ + R[k] = c; \ + c = LowWord(d); \ + c += A[i] * B[j]; + +#define Bot_Acc(i, j) \ + c += A[i] * B[j]; + +#define Bot_End(n) \ + R[n-1] = c; +*/ - SaveMulAcc(10, 4, 7); - MulAcc(5, 6); - MulAcc(6, 5); - MulAcc(7, 4); - - SaveMulAcc(11, 5, 7); - MulAcc(6, 6); - MulAcc(7, 5); - - SaveMulAcc(12, 6, 7); - MulAcc(7, 6); - - R[13] = c; - p = DWord::MultiplyAndAdd(A[7], B[7], d); - R[14] = p.GetLowHalf(); - R[15] = e + p.GetHighHalf(); -} - -void Portable::Multiply4Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - R[2] = c; - R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0]; -} - -void Portable::Multiply8Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); +#define Squ_Begin(n) \ + Declare2Words(p) \ + Declare2Words(c) \ + Declare2Words(d) \ + Declare2Words(e) \ + MultiplyWords(p, A[0], A[0]) \ + R[0] = LowWord(p); \ + AssignWord(e, HighWord(p)) \ + MultiplyWords(p, A[0], A[1]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) \ + Squ_NonDiag \ - R[6] = c; - R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] + - A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0]; -} +#define Squ_NonDiag \ + Double2Words(c) \ + Double2Words(d) \ -#undef MulAcc -#undef SaveMulAcc -#undef SquAcc -#undef SaveSquAcc +#define Squ_SaveAcc(k, i, j) \ + Acc2WordsBy2(c, e) \ + R[k] = LowWord(c); \ + Add2WordsBy1(e, d, HighWord(c)) \ + MultiplyWords(p, A[i], A[j]) \ + AssignWord(c, LowWord(p)) \ + AssignWord(d, HighWord(p)) \ -#ifdef CRYPTOPP_X86ASM_AVAILABLE +#define Squ_Acc(i, j) \ + MultiplyWords(p, A[i], A[j]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) -// ************** x86 feature detection *************** +#define Squ_Diag(i) \ + Squ_NonDiag \ + MultiplyWords(p, A[i], A[i]) \ + Acc2WordsBy1(c, LowWord(p)) \ + Acc2WordsBy1(d, HighWord(p)) \ -static bool s_sse2Enabled = true; +#define Squ_End(n) \ + Acc2WordsBy2(c, e) \ + R[2*n-3] = LowWord(c); \ + Acc2WordsBy1(d, HighWord(c)) \ + MultiplyWords(p, A[n-1], A[n-1])\ + Acc2WordsBy2(d, p) \ + R[2*n-2] = LowWord(d); \ + R[2*n-1] = HighWord(d); -static void CpuId(word32 input, word32 *output) +void Baseline_Multiply2(word *R, const word *A, const word *B) { -#ifdef __GNUC__ - __asm__ - ( - // save ebx in case -fPIC is being used - "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" - : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) - : "a" (input) - ); -#else - __asm - { - mov eax, input - cpuid - mov edi, output - mov [edi], eax - mov [edi+4], ebx - mov [edi+8], ecx - mov [edi+12], edx - } -#endif + Mul_2 } -#ifdef SSE2_INTRINSICS_AVAILABLE -#ifndef _MSC_VER -static jmp_buf s_env; -static void SigIllHandler(int) +void Baseline_Multiply4(word *R, const word *A, const word *B) { - longjmp(s_env, 1); + Mul_4 } -#endif -static bool HasSSE2() +void Baseline_Multiply8(word *R, const word *A, const word *B) { - if (!s_sse2Enabled) - return false; - - word32 cpuid[4]; - CpuId(1, cpuid); - if ((cpuid[3] & (1 << 26)) == 0) - return false; - -#ifdef _MSC_VER - __try - { - __asm xorpd xmm0, xmm0 // executing SSE2 instruction - } - __except (1) - { - return false; - } - return true; -#else - typedef void (*SigHandler)(int); + Mul_8 +} - SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; +void Baseline_Square2(word *R, const word *A) +{ + Squ_2 +} - bool result = true; - if (setjmp(s_env)) - result = false; - else - __asm __volatile ("xorps %xmm0, %xmm0"); +void Baseline_Square4(word *R, const word *A) +{ + Squ_4 +} - signal(SIGILL, oldHandler); - return result; -#endif +void Baseline_Square8(word *R, const word *A) +{ + Squ_8 } -#endif -static bool IsP4() +void Baseline_MultiplyBottom2(word *R, const word *A, const word *B) { - word32 cpuid[4]; + Bot_2 +} - CpuId(0, cpuid); - std::swap(cpuid[2], cpuid[3]); - if (memcmp(cpuid+1, "GenuineIntel", 12) != 0) - return false; +void Baseline_MultiplyBottom4(word *R, const word *A, const word *B) +{ + Bot_4 +} - CpuId(1, cpuid); - return ((cpuid[0] >> 8) & 0xf) == 0xf; +void Baseline_MultiplyBottom8(word *R, const word *A, const word *B) +{ + Bot_8 } -// ************** Pentium/P4 optimizations *************** +/* +void Baseline_Multiply16(word *R, const word *A, const word *B) +{ + Mul_16 +} -class PentiumOptimized : public Portable +void Baseline_Square16(word *R, const word *A) { -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; + Squ_16 +} -class P4Optimized +void Baseline_MultiplyBottom16(word *R, const word *A, const word *B) { -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); -#ifdef SSE2_INTRINSICS_AVAILABLE - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -#endif -}; + Bot_16 +} +*/ -typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N); -typedef void (* PMul)(word *C, const word *A, const word *B); +// ******************************************************** + +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff}; + +#undef Mul_Begin +#undef Mul_Acc +#undef Squ_Acc +#undef Squ_NonDiag +#undef Squ_Diag +#undef Squ_SaveAcc +#undef Squ_Begin +#undef Mul_SaveAcc +#undef Bot_Acc +#undef Bot_SaveAcc +#undef Bot_End +#undef Squ_End +#undef Mul_End + +#define SSE2_FinalSave(k) \ + AS2( psllq xmm5, 16) \ + AS2( paddq xmm4, xmm5) \ + AS2( movq QWORD PTR [ecx+8*(k)], xmm4) + +#define SSE2_SaveShift(k) \ + AS2( movq xmm0, xmm6) \ + AS2( punpckhqdq xmm6, xmm0) \ + AS2( movq xmm1, xmm7) \ + AS2( punpckhqdq xmm7, xmm1) \ + AS2( paddd xmm6, xmm0) \ + AS2( pslldq xmm6, 4) \ + AS2( paddd xmm7, xmm1) \ + AS2( paddd xmm4, xmm6) \ + AS2( pslldq xmm7, 4) \ + AS2( movq xmm6, xmm4) \ + AS2( paddd xmm5, xmm7) \ + AS2( movq xmm7, xmm5) \ + AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ + AS2( psrlq xmm6, 16) \ + AS2( paddq xmm6, xmm7) \ + AS2( punpckhqdq xmm4, xmm0) \ + AS2( punpckhqdq xmm5, xmm0) \ + AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \ + AS2( psrlq xmm6, 3*16) \ + AS2( paddd xmm4, xmm6) \ + +#define Squ_SSE2_SaveShift(k) \ + AS2( movq xmm0, xmm6) \ + AS2( punpckhqdq xmm6, xmm0) \ + AS2( movq xmm1, xmm7) \ + AS2( punpckhqdq xmm7, xmm1) \ + AS2( paddd xmm6, xmm0) \ + AS2( pslldq xmm6, 4) \ + AS2( paddd xmm7, xmm1) \ + AS2( paddd xmm4, xmm6) \ + AS2( pslldq xmm7, 4) \ + AS2( movhlps xmm6, xmm4) \ + AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ + AS2( paddd xmm5, xmm7) \ + AS2( movhps QWORD PTR [esp+12], xmm5)\ + AS2( psrlq xmm4, 16) \ + AS2( paddq xmm4, xmm5) \ + AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \ + AS2( psrlq xmm4, 3*16) \ + AS2( paddd xmm4, xmm6) \ + AS2( movq QWORD PTR [esp+4], xmm4)\ + +#define SSE2_FirstMultiply(i) \ + AS2( movdqa xmm7, [esi+(i)*16])\ + AS2( movdqa xmm5, [edi-(i)*16])\ + AS2( pmuludq xmm5, xmm7) \ + AS2( movdqa xmm4, [ebx])\ + AS2( movdqa xmm6, xmm4) \ + AS2( pand xmm4, xmm5) \ + AS2( psrld xmm5, 16) \ + AS2( pmuludq xmm7, [edx-(i)*16])\ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) + +#define Squ_Begin(n) \ + SquPrologue \ + AS2( mov esi, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( lea edi, [esp-32*n])\ + AS2( sub esp, 32*n+16)\ + AS1( push esi)\ + AS2( mov esi, edi) \ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + AS2( movdqa [edi+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa [edi+2*edx+16], xmm0) \ + AS2( movdqa [edi+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [edi+16*n+2*edx+16], xmm1) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( lea edx, [edi+16*n])\ + SSE2_FirstMultiply(0) \ + +#define Squ_Acc(i) \ + ASL(LSqu##i) \ + AS2( movdqa xmm1, [esi+(i)*16]) \ + AS2( movdqa xmm0, [edi-(i)*16]) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-(i)*16]) \ + AS2( movdqa xmm3, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm3, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm3) \ + AS2( paddd xmm7, xmm1) \ + +#define Squ_Acc1(i) +#define Squ_Acc2(i) ASC(call, LSqu##i) +#define Squ_Acc3(i) Squ_Acc2(i) +#define Squ_Acc4(i) Squ_Acc2(i) +#define Squ_Acc5(i) Squ_Acc2(i) +#define Squ_Acc6(i) Squ_Acc2(i) +#define Squ_Acc7(i) Squ_Acc2(i) +#define Squ_Acc8(i) Squ_Acc2(i) + +#define SSE2_End(E, n) \ + SSE2_SaveShift(2*(n)-3) \ + AS2( movdqa xmm7, [esi+16]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm7) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm7, [edx]) \ + AS2( movdqa xmm6, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) \ + SSE2_SaveShift(2*(n)-2) \ + SSE2_FinalSave(2*(n)-1) \ + AS1( pop esp)\ + E + +#define Squ_End(n) SSE2_End(SquEpilogue, n) +#define Mul_End(n) SSE2_End(MulEpilogue, n) +#define Top_End(n) SSE2_End(TopEpilogue, n) + +#define Squ_Column1(k, i) \ + Squ_SSE2_SaveShift(k) \ + AS2( add esi, 16) \ + SSE2_FirstMultiply(1)\ + Squ_Acc##i(i) \ + AS2( paddd xmm4, xmm4) \ + AS2( paddd xmm5, xmm5) \ + AS2( movdqa xmm3, [esi]) \ + AS2( movq xmm1, QWORD PTR [esi+8]) \ + AS2( pmuludq xmm1, xmm3) \ + AS2( pmuludq xmm3, xmm3) \ + AS2( movdqa xmm0, [ebx])\ + AS2( movdqa xmm2, xmm0) \ + AS2( pand xmm0, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm0) \ + AS2( paddd xmm7, xmm1) \ + AS2( pand xmm2, xmm3) \ + AS2( psrld xmm3, 16) \ + AS2( paddd xmm6, xmm6) \ + AS2( paddd xmm7, xmm7) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm3) \ + AS2( movq xmm0, QWORD PTR [esp+4])\ + AS2( movq xmm1, QWORD PTR [esp+12])\ + AS2( paddd xmm4, xmm0)\ + AS2( paddd xmm5, xmm1)\ + +#define Squ_Column0(k, i) \ + Squ_SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_FirstMultiply(1)\ + Squ_Acc##i(i) \ + AS2( paddd xmm6, xmm6) \ + AS2( paddd xmm7, xmm7) \ + AS2( paddd xmm4, xmm4) \ + AS2( paddd xmm5, xmm5) \ + AS2( movq xmm0, QWORD PTR [esp+4])\ + AS2( movq xmm1, QWORD PTR [esp+12])\ + AS2( paddd xmm4, xmm0)\ + AS2( paddd xmm5, xmm1)\ + +#define SSE2_MulAdd45 \ + AS2( movdqa xmm7, [esi]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm7) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm7, [edx]) \ + AS2( movdqa xmm6, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm6, xmm7) \ + AS2( psrld xmm7, 16) + +#define Mul_Begin(n) \ + MulPrologue \ + AS2( mov esi, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( sub esp, 48*n+16)\ + AS1( push esi)\ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ + AS2( movdqa [esp+20+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa [esp+20+2*edx+16], xmm0) \ + AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ + AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ + AS2( psrlq xmm2, 32) \ + AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( lea edi, [esp+20])\ + AS2( lea edx, [esp+20+16*n])\ + AS2( lea esi, [esp+20+32*n])\ + SSE2_FirstMultiply(0) \ + +#define Mul_Acc(i) \ + ASL(LMul##i) \ + AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm2, [ebx]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm3, xmm2) \ + AS2( pand xmm2, xmm0) \ + AS2( psrld xmm0, 16) \ + AS2( paddd xmm4, xmm2) \ + AS2( paddd xmm5, xmm0) \ + AS2( pand xmm3, xmm1) \ + AS2( psrld xmm1, 16) \ + AS2( paddd xmm6, xmm3) \ + AS2( paddd xmm7, xmm1) \ + +#define Mul_Acc1(i) +#define Mul_Acc2(i) ASC(call, LMul##i) +#define Mul_Acc3(i) Mul_Acc2(i) +#define Mul_Acc4(i) Mul_Acc2(i) +#define Mul_Acc5(i) Mul_Acc2(i) +#define Mul_Acc6(i) Mul_Acc2(i) +#define Mul_Acc7(i) Mul_Acc2(i) +#define Mul_Acc8(i) Mul_Acc2(i) +#define Mul_Acc9(i) Mul_Acc2(i) +#define Mul_Acc10(i) Mul_Acc2(i) +#define Mul_Acc11(i) Mul_Acc2(i) +#define Mul_Acc12(i) Mul_Acc2(i) +#define Mul_Acc13(i) Mul_Acc2(i) +#define Mul_Acc14(i) Mul_Acc2(i) +#define Mul_Acc15(i) Mul_Acc2(i) +#define Mul_Acc16(i) Mul_Acc2(i) + +#define Mul_Column1(k, i) \ + SSE2_SaveShift(k) \ + AS2( add esi, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Mul_Column0(k, i) \ + SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Bot_Acc(i) \ + AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( pmuludq xmm0, xmm1) \ + AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( paddq xmm4, xmm0) \ + AS2( paddd xmm6, xmm1) + +#define Bot_SaveAcc(k) \ + SSE2_SaveShift(k) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + AS2( movdqa xmm6, [esi]) \ + AS2( movdqa xmm0, [edi]) \ + AS2( pmuludq xmm0, xmm6) \ + AS2( paddq xmm4, xmm0) \ + AS2( psllq xmm5, 16) \ + AS2( paddq xmm4, xmm5) \ + AS2( pmuludq xmm6, [edx]) + +#define Bot_End(n) \ + AS2( movhlps xmm7, xmm6) \ + AS2( paddd xmm6, xmm7) \ + AS2( psllq xmm6, 32) \ + AS2( paddd xmm4, xmm6) \ + AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \ + AS1( pop esp)\ + MulEpilogue -static PAddSub s_pAdd, s_pSub; -#ifdef SSE2_INTRINSICS_AVAILABLE -static PMul s_pMul4, s_pMul8, s_pMul8B; +#define Top_Begin(n) \ + TopPrologue \ + AS2( mov edx, esp)\ + AS2( and esp, 0xfffffff0)\ + AS2( sub esp, 48*n+16)\ + AS1( push edx)\ + AS2( xor edx, edx) \ + ASL(1) \ + ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ + ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ + ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ + AS2( movdqa [esp+20+2*edx], xmm0) \ + AS2( psrlq xmm0, 32) \ + AS2( movdqa [esp+20+2*edx+16], xmm0) \ + AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ + AS2( psrlq xmm1, 32) \ + AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ + AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ + AS2( psrlq xmm2, 32) \ + AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ + AS2( add edx, 16) \ + AS2( cmp edx, 8*(n)) \ + ASJ( jne, 1, b) \ + AS2( mov eax, esi) \ + AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\ + AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\ + AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\ + AS2( pxor xmm4, xmm4)\ + AS2( pxor xmm5, xmm5) + +#define Top_Acc(i) \ + AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \ + AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ + AS2( psrlq xmm0, 48) \ + AS2( paddd xmm5, xmm0)\ + +#define Top_Column0(i) \ + AS2( psllq xmm5, 32) \ + AS2( add edi, 16) \ + AS2( add edx, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + +#define Top_Column1(i) \ + SSE2_SaveShift(0) \ + AS2( add esi, 16) \ + SSE2_MulAdd45\ + Mul_Acc##i(i) \ + AS2( shr eax, 16) \ + AS2( movd xmm0, eax)\ + AS2( movd xmm1, [ecx+4])\ + AS2( psrld xmm1, 16)\ + AS2( pcmpgtd xmm1, xmm0)\ + AS2( psrld xmm1, 31)\ + AS2( paddd xmm4, xmm1)\ + +void SSE2_Square4(word *C, const word *A) +{ + Squ_Begin(2) + Squ_Column0(0, 1) + Squ_End(2) +} + +void SSE2_Square8(word *C, const word *A) +{ + Squ_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Squ_Acc(2) + AS1( ret) ASL(0) #endif + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 1) + Squ_Column0(4, 1) + Squ_End(4) +} -static void SetPentiumFunctionPointers() +void SSE2_Square16(word *C, const word *A) { - if (IsP4()) - { - s_pAdd = &P4Optimized::Add; - s_pSub = &P4Optimized::Subtract; - } - else - { - s_pAdd = &PentiumOptimized::Add; - s_pSub = &PentiumOptimized::Subtract; - } - -#ifdef SSE2_INTRINSICS_AVAILABLE - if (HasSSE2()) - { - s_pMul4 = &P4Optimized::Multiply4; - s_pMul8 = &P4Optimized::Multiply8; - s_pMul8B = &P4Optimized::Multiply8Bottom; - } - else - { - s_pMul4 = &PentiumOptimized::Multiply4; - s_pMul8 = &PentiumOptimized::Multiply8; - s_pMul8B = &PentiumOptimized::Multiply8Bottom; - } + Squ_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) + AS1( ret) ASL(0) +#endif + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 2) + Squ_Column0(4, 3) + Squ_Column1(5, 3) + Squ_Column0(6, 4) + Squ_Column1(7, 3) + Squ_Column0(8, 3) + Squ_Column1(9, 2) + Squ_Column0(10, 2) + Squ_Column1(11, 1) + Squ_Column0(12, 1) + Squ_End(8) +} + +void SSE2_Square32(word *C, const word *A) +{ + Squ_Begin(16) + ASJ( jmp, 0, f) + Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) + AS1( ret) ASL(0) + Squ_Column0(0, 1) + Squ_Column1(1, 1) + Squ_Column0(2, 2) + Squ_Column1(3, 2) + Squ_Column0(4, 3) + Squ_Column1(5, 3) + Squ_Column0(6, 4) + Squ_Column1(7, 4) + Squ_Column0(8, 5) + Squ_Column1(9, 5) + Squ_Column0(10, 6) + Squ_Column1(11, 6) + Squ_Column0(12, 7) + Squ_Column1(13, 7) + Squ_Column0(14, 8) + Squ_Column1(15, 7) + Squ_Column0(16, 7) + Squ_Column1(17, 6) + Squ_Column0(18, 6) + Squ_Column1(19, 5) + Squ_Column0(20, 5) + Squ_Column1(21, 4) + Squ_Column0(22, 4) + Squ_Column1(23, 3) + Squ_Column0(24, 3) + Squ_Column1(25, 2) + Squ_Column0(26, 2) + Squ_Column1(27, 1) + Squ_Column0(28, 1) + Squ_End(16) +} + +void SSE2_Multiply4(word *C, const word *A, const word *B) +{ + Mul_Begin(2) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(2) + AS1( ret) ASL(0) #endif + Mul_Column0(0, 2) + Mul_End(2) } -void DisableSSE2() +void SSE2_Multiply8(word *C, const word *A, const word *B) { - s_sse2Enabled = false; - SetPentiumFunctionPointers(); + Mul_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 3) + Mul_Column0(4, 2) + Mul_End(4) } -class LowLevel : public PentiumOptimized +void SSE2_Multiply16(word *C, const word *A, const word *B) { -public: - inline static int Add(word *C, const word *A, const word *B, size_t N) - {return s_pAdd(C, A, B, N);} - inline static int Subtract(word *C, const word *A, const word *B, size_t N) - {return s_pSub(C, A, B, N);} - inline static void Square4(word *R, const word *A) - {Multiply4(R, A, A);} -#ifdef SSE2_INTRINSICS_AVAILABLE - inline static void Multiply4(word *C, const word *A, const word *B) - {s_pMul4(C, A, B);} - inline static void Multiply8(word *C, const word *A, const word *B) - {s_pMul8(C, A, B);} - inline static void Multiply8Bottom(word *C, const word *A, const word *B) - {s_pMul8B(C, A, B);} + Mul_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) #endif -}; - -// use some tricks to share assembly code between MSVC and GCC -#ifdef _MSC_VER - #define CRYPTOPP_NAKED __declspec(naked) - #define AS1(x) __asm x - #define AS2(x, y) __asm x, y - #define AddPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+20] \ - __asm mov edx, [esp+24] \ - __asm mov ebx, [esp+28] \ - __asm mov esi, [esp+32] - #define AddEpilogue \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret - #define MulPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+28] \ - __asm mov esi, [esp+24] \ - __asm push [esp+20] - #define MulEpilogue \ - __asm add esp, 4 \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret -#else - #define CRYPTOPP_NAKED - #define AS1(x) #x ";" - #define AS2(x, y) #x ", " #y ";" - #define AddPrologue \ - __asm__ __volatile__ \ - ( \ - "push %%ebx;" /* save this manually, in case of -fPIC */ \ - "mov %2, %%ebx;" \ - ".intel_syntax noprefix;" \ - "push ebp;" - #define AddEpilogue \ - "pop ebp;" \ - ".att_syntax prefix;" \ - "pop %%ebx;" \ - : \ - : "c" (C), "d" (A), "m" (B), "S" (N) \ - : "%edi", "memory", "cc" \ - ); - #define MulPrologue \ - __asm__ __volatile__ \ - ( \ - "push %%ebx;" /* save this manually, in case of -fPIC */ \ - "push %%ebp;" \ - "push %0;" \ - ".intel_syntax noprefix;" - #define MulEpilogue \ - "add esp, 4;" \ - "pop ebp;" \ - "pop ebx;" \ - ".att_syntax prefix;" \ - : \ - : "rm" (Z), "S" (X), "c" (Y) \ - : "%eax", "%edx", "%edi", "memory", "cc" \ - ); + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 7) + Mul_Column0(8, 6) + Mul_Column1(9, 5) + Mul_Column0(10, 4) + Mul_Column1(11, 3) + Mul_Column0(12, 2) + Mul_End(8) +} + +void SSE2_Multiply32(word *C, const word *A, const word *B) +{ + Mul_Begin(16) + ASJ( jmp, 0, f) + Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 9) + Mul_Column0(8, 10) + Mul_Column1(9, 11) + Mul_Column0(10, 12) + Mul_Column1(11, 13) + Mul_Column0(12, 14) + Mul_Column1(13, 15) + Mul_Column0(14, 16) + Mul_Column1(15, 15) + Mul_Column0(16, 14) + Mul_Column1(17, 13) + Mul_Column0(18, 12) + Mul_Column1(19, 11) + Mul_Column0(20, 10) + Mul_Column1(21, 9) + Mul_Column0(22, 8) + Mul_Column1(23, 7) + Mul_Column0(24, 6) + Mul_Column1(25, 5) + Mul_Column0(26, 4) + Mul_Column1(27, 3) + Mul_Column0(28, 2) + Mul_End(16) +} + +void SSE2_MultiplyBottom4(word *C, const word *A, const word *B) +{ + Mul_Begin(2) + Bot_SaveAcc(0) Bot_Acc(2) + Bot_End(2) +} + +void SSE2_MultiplyBottom8(word *C, const word *A, const word *B) +{ + Mul_Begin(4) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) #endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(4) +} -CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N) +void SSE2_MultiplyBottom16(word *C, const word *A, const word *B) { - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendAdd) // if no dwords then nothing to do - - AS1(loopstartAdd:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( adc esi,edi) // add lower words - AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - - AS2( adc ebp,edi) // add higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result - - AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero - - AS1(loopendAdd:) - AS2( adc eax, 0) // store carry into eax (return result register) - - AddEpilogue + Mul_Begin(8) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(8) +} + +void SSE2_MultiplyBottom32(word *C, const word *A, const word *B) +{ + Mul_Begin(16) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Mul_Column0(0, 2) + Mul_Column1(1, 3) + Mul_Column0(2, 4) + Mul_Column1(3, 5) + Mul_Column0(4, 6) + Mul_Column1(5, 7) + Mul_Column0(6, 8) + Mul_Column1(7, 9) + Mul_Column0(8, 10) + Mul_Column1(9, 11) + Mul_Column0(10, 12) + Mul_Column1(11, 13) + Mul_Column0(12, 14) + Mul_Column1(13, 15) + Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) + Bot_End(16) +} + +void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L) +{ + Top_Begin(4) + Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(4) + Top_Column1(3) + Mul_Column0(0, 2) + Top_End(2) } -CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N) +void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L) { - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendSub) // if no dwords then nothing to do - - AS1(loopstartSub:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( sbb esi,edi) // subtract lower words - AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - - AS2( sbb ebp,edi) // subtract higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result + Top_Begin(8) + Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(8) + Top_Column1(7) + Mul_Column0(0, 6) + Mul_Column1(1, 5) + Mul_Column0(2, 4) + Mul_Column1(3, 3) + Mul_Column0(4, 2) + Top_End(4) +} + +void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L) +{ + Top_Begin(16) + Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) +#ifndef __GNUC__ + ASJ( jmp, 0, f) + Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) + AS1( ret) ASL(0) +#endif + Top_Column0(16) + Top_Column1(15) + Mul_Column0(0, 14) + Mul_Column1(1, 13) + Mul_Column0(2, 12) + Mul_Column1(3, 11) + Mul_Column0(4, 10) + Mul_Column1(5, 9) + Mul_Column0(6, 8) + Mul_Column1(7, 7) + Mul_Column0(8, 6) + Mul_Column1(9, 5) + Mul_Column0(10, 4) + Mul_Column1(11, 3) + Mul_Column0(12, 2) + Top_End(8) +} + +#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - AS1( jnz loopstartSub) // loop until eax overflows and becomes zero +// ******************************************************** - AS1(loopendSub:) - AS2( adc eax, 0) // store carry into eax (return result register) +typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B); +typedef void (* PMul)(word *C, const word *A, const word *B); +typedef void (* PSqu)(word *C, const word *A); +typedef void (* PMulTop)(word *C, const word *A, const word *B, word L); - AddEpilogue -} +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub; +static PMulTop s_pTop[3]; +static size_t s_recursionLimit = 8; +#else +static const size_t s_recursionLimit = 8; +#endif -// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them. +static PMul s_pMul[9], s_pBot[9]; +static PSqu s_pSqu[9]; -CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N) +static void SetFunctionPointers() { - AddPrologue + s_pMul[0] = &Baseline_Multiply2; + s_pBot[0] = &Baseline_MultiplyBottom2; + s_pSqu[0] = &Baseline_Square2; - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendAddP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov ebp, [ebx]) - AS1( jmp carry1AddP4) - - AS1(loopstartAddP4:) - AS2( mov edi, [edx+8]) - AS2( add ecx, 8) - AS2( add edx, 8) - AS2( mov ebp, [ebx]) - AS2( add edi, eax) - AS1( jc carry1AddP4) - AS2( xor eax, eax) - - AS1(carry1AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( add edi, eax) - AS1( jc carry2AddP4) - AS2( xor eax, eax) - - AS1(carry2AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartAddP4) - - AS1(loopendAddP4:) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + { + if (IsP4()) + { + s_pAdd = &SSE2_Add; + s_pSub = &SSE2_Sub; + } - AddEpilogue -} + s_recursionLimit = 32; -CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue + s_pMul[1] = &SSE2_Multiply4; + s_pMul[2] = &SSE2_Multiply8; + s_pMul[4] = &SSE2_Multiply16; + s_pMul[8] = &SSE2_Multiply32; - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendSubP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov ebp, [ebx]) - AS1( jmp carry1SubP4) - - AS1(loopstartSubP4:) - AS2( mov edi, [edx+8]) - AS2( add edx, 8) - AS2( add ecx, 8) - AS2( mov ebp, [ebx]) - AS2( sub edi, eax) - AS1( jc carry1SubP4) - AS2( xor eax, eax) - - AS1(carry1SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( sub edi, eax) - AS1( jc carry2SubP4) - AS2( xor eax, eax) - - AS1(carry2SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartSubP4) - - AS1(loopendSubP4:) + s_pBot[1] = &SSE2_MultiplyBottom4; + s_pBot[2] = &SSE2_MultiplyBottom8; + s_pBot[4] = &SSE2_MultiplyBottom16; + s_pBot[8] = &SSE2_MultiplyBottom32; - AddEpilogue -} + s_pSqu[1] = &SSE2_Square4; + s_pSqu[2] = &SSE2_Square8; + s_pSqu[4] = &SSE2_Square16; + s_pSqu[8] = &SSE2_Square32; -// multiply assembly code originally contributed by Leonard Janke - -#define MulStartup \ - AS2(xor ebp, ebp) \ - AS2(xor edi, edi) \ - AS2(xor ebx, ebx) - -#define MulShiftCarry \ - AS2(mov ebp, edx) \ - AS2(mov edi, ebx) \ - AS2(xor ebx, ebx) - -#define MulAccumulateBottom(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS2(imul eax, dword ptr [esi+4*i]) \ - AS2(add ebp, eax) - -#define MulAccumulate(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS1(mul dword ptr [esi+4*i]) \ - AS2(add ebp, eax) \ - AS2(adc edi, edx) \ - AS2(adc bl, bh) - -#define MulStoreDigit(i) \ - AS2(mov edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*i], ebp) - -#define MulLastDiagonal(digits) \ - AS2(mov eax, [ecx+4*(digits-1)]) \ - AS1(mul dword ptr [esi+4*(digits-1)]) \ - AS2(add ebp, eax) \ - AS2(adc edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*(2*digits-2)], ebp) \ - AS2(mov [edi+4*(2*digits-1)], edx) - -CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(3,2) - MulAccumulate(2,3) - MulStoreDigit(5) - MulShiftCarry - - MulLastDiagonal(4) - MulEpilogue -} + s_pTop[0] = &SSE2_MultiplyTop8; + s_pTop[1] = &SSE2_MultiplyTop16; + s_pTop[2] = &SSE2_MultiplyTop32; + } + else +#endif + { + s_pMul[1] = &Baseline_Multiply4; + s_pMul[2] = &Baseline_Multiply8; +// s_pMul[4] = &Baseline_Multiply16; -CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulate(7,0) - MulAccumulate(6,1) - MulAccumulate(5,2) - MulAccumulate(4,3) - MulAccumulate(3,4) - MulAccumulate(2,5) - MulAccumulate(1,6) - MulAccumulate(0,7) - MulStoreDigit(7) - MulShiftCarry - - MulAccumulate(7,1) - MulAccumulate(6,2) - MulAccumulate(5,3) - MulAccumulate(4,4) - MulAccumulate(3,5) - MulAccumulate(2,6) - MulAccumulate(1,7) - MulStoreDigit(8) - MulShiftCarry - - MulAccumulate(7,2) - MulAccumulate(6,3) - MulAccumulate(5,4) - MulAccumulate(4,5) - MulAccumulate(3,6) - MulAccumulate(2,7) - MulStoreDigit(9) - MulShiftCarry - - MulAccumulate(7,3) - MulAccumulate(6,4) - MulAccumulate(5,5) - MulAccumulate(4,6) - MulAccumulate(3,7) - MulStoreDigit(10) - MulShiftCarry - - MulAccumulate(7,4) - MulAccumulate(6,5) - MulAccumulate(5,6) - MulAccumulate(4,7) - MulStoreDigit(11) - MulShiftCarry - - MulAccumulate(7,5) - MulAccumulate(6,6) - MulAccumulate(5,7) - MulStoreDigit(12) - MulShiftCarry - - MulAccumulate(7,6) - MulAccumulate(6,7) - MulStoreDigit(13) - MulShiftCarry - - MulLastDiagonal(8) - MulEpilogue -} + s_pBot[1] = &Baseline_MultiplyBottom4; + s_pBot[2] = &Baseline_MultiplyBottom8; +// s_pBot[4] = &Baseline_MultiplyBottom16; -CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulateBottom(7,0) - MulAccumulateBottom(6,1) - MulAccumulateBottom(5,2) - MulAccumulateBottom(4,3) - MulAccumulateBottom(3,4) - MulAccumulateBottom(2,5) - MulAccumulateBottom(1,6) - MulAccumulateBottom(0,7) - MulStoreDigit(7) - MulEpilogue + s_pSqu[1] = &Baseline_Square4; + s_pSqu[2] = &Baseline_Square8; +// s_pSqu[4] = &Baseline_Square16; + } } -#undef AS1 -#undef AS2 - -#else // not x86 - no processor specific code at this layer - -typedef Portable LowLevel; - -#endif - -#ifdef SSE2_INTRINSICS_AVAILABLE - -#ifdef __GNUC__ -#define CRYPTOPP_FASTCALL +inline int Add(word *C, const word *A, const word *B, size_t N) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + return s_pAdd(N, C, A, B); #else -#define CRYPTOPP_FASTCALL __fastcall + return Baseline_Add(N, C, A, B); #endif - -static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) -{ - __m128i a3210 = _mm_load_si128(A); - __m128i b3210 = _mm_load_si128(B); - - __m128i sum; - - __m128i z = _mm_setzero_si128(); - __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); - C[0] = a2b2_a0b0; - - __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); - __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); - __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); - __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); - C[1] = _mm_add_epi64(a1b0, a0b1); - - __m128i a31 = _mm_srli_epi64(a3210, 32); - __m128i b31 = _mm_srli_epi64(b3210, 32); - __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); - C[6] = a3b3_a1b1; - - __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); - __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); - __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); - __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); - sum = _mm_add_epi64(a1b1, a0b2); - C[2] = _mm_add_epi64(sum, a2b0); - - __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); - __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); - __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); - __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); - __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); - __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); - __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); - __m128i sum1 = _mm_add_epi64(a3b0, a1b2); - sum = _mm_add_epi64(a2b1, a0b3); - C[3] = _mm_add_epi64(sum, sum1); - - __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); - __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); - __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); - __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); - sum = _mm_add_epi64(a2b2, a3b1); - C[4] = _mm_add_epi64(sum, a1b3); - - __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); - __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); - __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); - __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); - __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); - C[5] = _mm_add_epi64(a3b2, a2b3); -} - -void P4Optimized::Multiply4(word *C, const word *A, const word *B) -{ - __m128i temp[7]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - C[0] = w[0]; - - __m64 s1, s2; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w18, w20); - s1 = _mm_add_si64(s1, s2); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w22, w26); - s1 = _mm_add_si64(s1, s2); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[7] = _mm_cvtsi64_si32(s1) + w[27]; - _mm_empty(); -} - -void P4Optimized::Multiply8(word *C, const word *A, const word *B) -{ - __m128i temp[28]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - const word *z = (word *)temp+7*4*3; - const __m64 *mz = (__m64 *)z; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - __m64 w27 = _mm_cvtsi32_si64(w[27]); - - __m64 x0 = _mm_cvtsi32_si64(x[0]); - __m64 x1 = _mm_cvtsi32_si64(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - __m64 x10 = mx[5]; - __m64 x12 = mx[6]; - __m64 x14 = mx[7]; - __m64 x16 = mx[8]; - __m64 x18 = mx[9]; - __m64 x20 = mx[10]; - __m64 x22 = mx[11]; - __m64 x26 = _mm_cvtsi32_si64(x[26]); - __m64 x27 = _mm_cvtsi32_si64(x[27]); - - __m64 y0 = _mm_cvtsi32_si64(y[0]); - __m64 y1 = _mm_cvtsi32_si64(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - __m64 y10 = my[5]; - __m64 y12 = my[6]; - __m64 y14 = my[7]; - __m64 y16 = my[8]; - __m64 y18 = my[9]; - __m64 y20 = my[10]; - __m64 y22 = my[11]; - __m64 y26 = _mm_cvtsi32_si64(y[26]); - __m64 y27 = _mm_cvtsi32_si64(y[27]); - - __m64 z0 = _mm_cvtsi32_si64(z[0]); - __m64 z1 = _mm_cvtsi32_si64(z[1]); - __m64 z4 = mz[2]; - __m64 z6 = mz[3]; - __m64 z8 = mz[4]; - __m64 z10 = mz[5]; - __m64 z12 = mz[6]; - __m64 z14 = mz[7]; - __m64 z16 = mz[8]; - __m64 z18 = mz[9]; - __m64 z20 = mz[10]; - __m64 z22 = mz[11]; - __m64 z26 = _mm_cvtsi32_si64(z[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x10, y10); - s4 = _mm_add_si64(x12, y12); - s1 = _mm_add_si64(s1, w27); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[7] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x14, y14); - s4 = _mm_add_si64(x16, y16); - s1 = _mm_add_si64(s1, z0); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[8] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x18, y18); - s4 = _mm_add_si64(x20, y20); - s1 = _mm_add_si64(s1, z1); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z4); - s1 = _mm_add_si64(s1, s3); - C[9] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x22, y22); - s4 = _mm_add_si64(x26, y26); - s1 = _mm_add_si64(s1, z6); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z8); - s1 = _mm_add_si64(s1, s3); - C[10] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x27, y27); - s1 = _mm_add_si64(s1, z10); - s1 = _mm_add_si64(s1, z12); - s1 = _mm_add_si64(s1, s3); - C[11] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z14, z16); - s1 = _mm_add_si64(s1, s3); - C[12] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z18, z20); - s1 = _mm_add_si64(s1, s3); - C[13] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z22, z26); - s1 = _mm_add_si64(s1, s3); - C[14] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[15] = z[27] + _mm_cvtsi64_si32(s1); - _mm_empty(); } -void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) -{ - __m128i temp[21]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - - __m64 x0 = _mm_cvtsi32_si64(x[0]); - __m64 x1 = _mm_cvtsi32_si64(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - - __m64 y0 = _mm_cvtsi32_si64(y[0]); - __m64 y1 = _mm_cvtsi32_si64(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; - _mm_empty(); +inline int Subtract(word *C, const word *A, const word *B, size_t N) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + return s_pSub(N, C, A, B); +#else + return Baseline_Sub(N, C, A, B); +#endif } -#endif // #ifdef SSE2_INTRINSICS_AVAILABLE - // ******************************************************** + #define A0 A #define A1 (A+N2) #define B0 B @@ -2004,64 +1876,37 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8) - LowLevel::Multiply8(R, A, B); - else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4) - LowLevel::Multiply4(R, A, B); - else if (N==2) - LowLevel::Multiply2(R, A, B); + if (N <= s_recursionLimit) + s_pMul[N/4](R, A, B); else { const size_t N2 = N/2; - int carry; - int aComp = Compare(A0, A1, N2); - int bComp = Compare(B0, B1, N2); + size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; + Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); - switch (2*aComp + aComp + bComp) - { - case -4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R0, N2); - carry = -1; - break; - case -2: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 2: - LowLevel::Subtract(R0, A0, A1, N2); - LowLevel::Subtract(R1, B1, B0, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R1, N2); - carry = -1; - break; - default: - SetWords(T0, 0, N); - carry = 0; - } + size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2; + Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); - RecursiveMultiply(R0, T2, A0, B0, N2); RecursiveMultiply(R2, T2, A1, B1, N2); + RecursiveMultiply(T0, T2, R0, R1, N2); + RecursiveMultiply(R0, T2, A0, B0, N2); // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 - carry += LowLevel::Add(T0, T0, R0, N); - carry += LowLevel::Add(T0, T0, R2, N); - carry += LowLevel::Add(R1, R1, T0, N); + int c2 = Add(R2, R2, R1, N2); + int c3 = c2; + c2 += Add(R1, R2, R0, N2); + c3 += Add(R2, R2, R3, N2); - assert (carry >= 0 && carry <= 2); - Increment(R3, N2, carry); + if (AN2 == BN2) + c3 -= Subtract(R1, R1, T0, N); + else + c3 += Add(R1, R1, T0, N); + + c3 += Increment(R2, N2, c2); + assert (c3 >= 0 && c3 <= 2); + Increment(R3, N2, c3); } } @@ -2072,12 +1917,9 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) void RecursiveSquare(word *R, word *T, const word *A, size_t N) { assert(N && N%2==0); - if (LowLevel::SquareRecursionLimit() >= 8 && N==8) - LowLevel::Square8(R, A); - if (LowLevel::SquareRecursionLimit() >= 4 && N==4) - LowLevel::Square4(R, A); - else if (N==2) - LowLevel::Square2(R, A); + + if (N <= s_recursionLimit) + s_pSqu[N/4](R, A); else { const size_t N2 = N/2; @@ -2086,35 +1928,32 @@ void RecursiveSquare(word *R, word *T, const word *A, size_t N) RecursiveSquare(R2, T2, A1, N2); RecursiveMultiply(T0, T2, A0, A1, N2); - int carry = LowLevel::Add(R1, R1, T0, N); - carry += LowLevel::Add(R1, R1, T0, N); + int carry = Add(R1, R1, T0, N); + carry += Add(R1, R1, T0, N); Increment(R3, N2, carry); } } // R[N] - bottom half of A*B -// T[N] - temporary work space +// T[3*N/2] - temporary work space // A[N] - multiplier // B[N] - multiplicant void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8) - LowLevel::Multiply8Bottom(R, A, B); - else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4) - LowLevel::Multiply4Bottom(R, A, B); - else if (N==2) - LowLevel::Multiply2Bottom(R, A, B); + + if (N <= s_recursionLimit) + s_pBot[N/4](R, A, B); else { const size_t N2 = N/2; RecursiveMultiply(R, T, A0, B0, N2); RecursiveMultiplyBottom(T0, T1, A1, B0, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); RecursiveMultiplyBottom(T0, T1, A0, B1, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); } } @@ -2124,88 +1963,61 @@ void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, siz // A[N] --- multiplier // B[N] --- multiplicant -void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) +void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (N==4) - { - LowLevel::Multiply4(T, A, B); - memcpy(R, T+4, 4*WORD_SIZE); - } - else if (N==2) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2() && ((N>=8) & (N<=32))) + s_pTop[N/16](R, A, B, L[N-1]); + else +#endif + if (N<=4) { - LowLevel::Multiply2(T, A, B); - memcpy(R, T+2, 2*WORD_SIZE); + s_pMul[N/4](T, A, B); + memcpy(R, T+N, N*WORD_SIZE); } else { const size_t N2 = N/2; - int carry; - int aComp = Compare(A0, A1, N2); - int bComp = Compare(B0, B1, N2); + size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; + Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); - switch (2*aComp + aComp + bComp) - { - case -4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R0, N2); - carry = -1; - break; - case -2: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 2: - LowLevel::Subtract(R0, A0, A1, N2); - LowLevel::Subtract(R1, B1, B0, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - carry = 0; - break; - case 4: - LowLevel::Subtract(R0, A1, A0, N2); - LowLevel::Subtract(R1, B0, B1, N2); - RecursiveMultiply(T0, T2, R0, R1, N2); - LowLevel::Subtract(T1, T1, R1, N2); - carry = -1; - break; - default: - SetWords(T0, 0, N); - carry = 0; - } - - RecursiveMultiply(T2, R0, A1, B1, N2); + size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2; + Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); - // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1 + RecursiveMultiply(T0, T2, R0, R1, N2); + RecursiveMultiply(R0, T2, A1, B1, N2); - int c2 = LowLevel::Subtract(R0, L+N2, L, N2); - c2 += LowLevel::Subtract(R0, R0, T0, N2); - int t = (Compare(R0, T2, N2) == -1); + // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1 - carry += t; - carry += Increment(R0, N2, c2+t); - carry += LowLevel::Add(R0, R0, T1, N2); - carry += LowLevel::Add(R0, R0, T3, N2); - assert (carry >= 0 && carry <= 2); + int t, c3; + int c2 = Subtract(T2, L+N2, L, N2); - CopyWords(R1, T3, N2); - Increment(R1, N2, carry); - } -} + if (AN2 == BN2) + { + c2 -= Add(T2, T2, T0, N2); + t = (Compare(T2, R0, N2) == -1); + c3 = t - Subtract(T2, T2, T1, N2); + } + else + { + c2 += Subtract(T2, T2, T0, N2); + t = (Compare(T2, R0, N2) == -1); + c3 = t + Add(T2, T2, T1, N2); + } -inline int Add(word *C, const word *A, const word *B, size_t N) -{ - return LowLevel::Add(C, A, B, N); -} + c2 += t; + if (c2 >= 0) + c3 += Increment(T2, N2, c2); + else + c3 -= Decrement(T2, N2, -c2); + c3 += Add(R0, T2, R1, N2); -inline int Subtract(word *C, const word *A, const word *B, size_t N) -{ - return LowLevel::Subtract(C, A, B, N); + assert (c3 >= 0 && c3 <= 2); + Increment(R1, N2, c3); + } } inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N) @@ -2223,23 +2035,6 @@ inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_ RecursiveMultiplyBottom(R, T, A, B, N); } -inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) -{ - RecursiveMultiplyTop(R, T, L, A, B, N); -} - -static word LinearMultiply(word *C, const word *A, word B, size_t N) -{ - word carry=0; - for(unsigned i=0; i<N; i++) - { - DWord p = DWord::MultiplyAndAdd(A[i], B, carry); - C[i] = p.GetLowHalf(); - carry = p.GetHighHalf(); - } - return carry; -} - // R[NA+NB] - result = A*B // T[NA+NB] - temporary work space // A[NA] ---- multiplier @@ -2264,7 +2059,6 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word * } assert(NB % NA == 0); - assert((NB/NA)%2 == 0); // NB is an even multiple of NA if (NA==2 && !A[1]) { @@ -2284,15 +2078,24 @@ void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word * } } - Multiply(R, T, A, B, NA); - CopyWords(T+2*NA, R+NA, NA); - size_t i; + if ((NB/NA)%2 == 0) + { + Multiply(R, T, A, B, NA); + CopyWords(T+2*NA, R+NA, NA); - for (i=2*NA; i<NB; i+=2*NA) - Multiply(T+NA+i, T, A, B+i, NA); - for (i=NA; i<NB; i+=2*NA) - Multiply(R+i, T, A, B+i, NA); + for (i=2*NA; i<NB; i+=2*NA) + Multiply(T+NA+i, T, A, B+i, NA); + for (i=NA; i<NB; i+=2*NA) + Multiply(R+i, T, A, B+i, NA); + } + else + { + for (i=0; i<NB; i+=2*NA) + Multiply(R+i, T, A, B+i, NA); + for (i=NA; i<NB; i+=2*NA) + Multiply(T+NA+i, T, A, B+i, NA); + } if (Add(R+NA, R+NA, T+2*NA, NB-NA)) Increment(R+NB, NA); @@ -2308,10 +2111,10 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N) { T[0] = AtomicInverseModPower2(A[0]); T[1] = 0; - LowLevel::Multiply2Bottom(T+2, T, A); + s_pBot[0](T+2, T, A); TwosComplement(T+2, 2); Increment(T+2, 2, 2); - LowLevel::Multiply2Bottom(R, T, T+2); + s_pBot[0](R, T, T+2); } else { @@ -2333,8 +2136,9 @@ void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N) // M[N] --- modulus // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N) -void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, size_t N) +void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N) { +#if 1 MultiplyBottom(R, T, X, U, N); MultiplyTop(T, T+N, X, R, M, N); word borrow = Subtract(T, X+N, T, N); @@ -2342,6 +2146,60 @@ void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word word carry = Add(T+N, T, M, N); assert(carry || !borrow); CopyWords(R, T + (borrow ? N : 0), N); +#elif 0 + const word u = 0-U[0]; + Declare2Words(p) + for (size_t i=0; i<N; i++) + { + const word t = u * X[i]; + word c = 0; + for (size_t j=0; j<N; j+=2) + { + MultiplyWords(p, t, M[j]); + Acc2WordsBy1(p, X[i+j]); + Acc2WordsBy1(p, c); + X[i+j] = LowWord(p); + c = HighWord(p); + MultiplyWords(p, t, M[j+1]); + Acc2WordsBy1(p, X[i+j+1]); + Acc2WordsBy1(p, c); + X[i+j+1] = LowWord(p); + c = HighWord(p); + } + + if (Increment(X+N+i, N-i, c)) + while (!Subtract(X+N, X+N, M, N)) {} + } + + memcpy(R, X+N, N*WORD_SIZE); +#else + __m64 u = _mm_cvtsi32_si64(0-U[0]), p; + for (size_t i=0; i<N; i++) + { + __m64 t = _mm_cvtsi32_si64(X[i]); + t = _mm_mul_su32(t, u); + __m64 c = _mm_setzero_si64(); + for (size_t j=0; j<N; j+=2) + { + p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j])); + p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j])); + c = _mm_add_si64(c, p); + X[i+j] = _mm_cvtsi64_si32(c); + c = _mm_srli_si64(c, 32); + p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1])); + p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1])); + c = _mm_add_si64(c, p); + X[i+j+1] = _mm_cvtsi64_si32(c); + c = _mm_srli_si64(c, 32); + } + + if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c))) + while (!Subtract(X+N, X+N, M, N)) {} + } + + memcpy(R, X+N, N*WORD_SIZE); + _mm_empty(); +#endif } // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M @@ -2491,7 +2349,7 @@ static inline void AtomicDivide(word *Q, const word *A, const word *B) // multiply quotient and divisor and add remainder, make sure it equals dividend assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); word P[4]; - Portable::Multiply2(P, Q, B); + s_pMul[0](P, Q, B); Add(P, P, T, 4); assert(memcmp(P, A, 4*WORD_SIZE)==0); } @@ -2503,21 +2361,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si { assert(N && N%2==0); - if (Q[1]) - { - T[N] = T[N+1] = 0; - unsigned i; - for (i=0; i<N; i+=4) - LowLevel::Multiply2(T+i, Q, B+i); - for (i=2; i<N; i+=4) - if (LowLevel::Multiply2Add(T+i, Q, B+i)) - T[i+5] += (++T[i+4]==0); - } - else - { - T[N] = LinearMultiply(T, B, Q[0], N); - T[N+1] = 0; - } + AsymmetricMultiply(T, T+N+2, Q, 2, B, N); word borrow = Subtract(R, R, T, N+2); assert(!borrow && !R[N+1]); @@ -2532,7 +2376,7 @@ static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, si // R[NB] -------- remainder = A%B // Q[NA-NB+2] --- quotient = A/B -// T[NA+2*NB+4] - temp work space +// T[NA+3*(NB+2)] - temp work space // A[NA] -------- dividend // B[NB] -------- divisor @@ -2726,9 +2570,7 @@ InitializeInteger::InitializeInteger() { if (!g_pAssignIntToInteger) { -#ifdef CRYPTOPP_X86ASM_AVAILABLE - SetPentiumFunctionPointers(); -#endif + SetFunctionPointers(); g_pAssignIntToInteger = AssignIntToInteger; } } @@ -2877,7 +2719,8 @@ Integer& Integer::operator=(const Integer& t) { if (this != &t) { - reg.New(RoundupSize(t.WordCount())); + if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0) + reg.New(RoundupSize(t.WordCount())); CopyWords(reg, t.reg, reg.size()); sign = t.sign; } @@ -3240,7 +3083,7 @@ public: void GenerateBlock(byte *output, size_t size) { - UnalignedPutWord(BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); + PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); ++m_counter; P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0); } @@ -3657,7 +3500,7 @@ void PositiveMultiply(Integer &product, const Integer &a, const Integer &b) product.reg.CleanNew(RoundupSize(aSize+bSize)); product.sign = Integer::POSITIVE; - SecAlignedWordBlock workspace(aSize + bSize); + IntegerSecBlock workspace(aSize + bSize); AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize); } @@ -3723,7 +3566,7 @@ void PositiveDivide(Integer &remainder, Integer "ient, quotient.reg.CleanNew(RoundupSize(aSize-bSize+2)); quotient.sign = Integer::POSITIVE; - SecAlignedWordBlock T(aSize+2*bSize+4); + IntegerSecBlock T(aSize+3*(bSize+2)); Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize); } |