path: root/integer.cpp
author    weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>    2003-07-31 01:54:53 +0000
committer weidai <weidai@57ff6487-cd31-0410-9ec3-f628ee90f5f0>    2003-07-31 01:54:53 +0000
commit    e84412066595cc6bec6565fbbf1a5ea6936c11d3 (patch)
tree      a628d3e147d838d2d23398c236d34d643b6330d2 /integer.cpp
parent    ab81a0ed8b889d6e50225792f4b3b62f0217d182 (diff)
download  cryptopp-e84412066595cc6bec6565fbbf1a5ea6936c11d3.tar.gz
enable SSE2 intrinsics on GCC 3.3 or later
git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@121 57ff6487-cd31-0410-9ec3-f628ee90f5f0
Diffstat (limited to 'integer.cpp')
-rw-r--r--    integer.cpp    1400
1 file changed, 671 insertions, 729 deletions
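
The change gates the SSE2 intrinsics on the compiler actually supporting them. As a minimal sketch (not the project's actual config.h test), the GCC side of that gate, implied by the commit message and by the new #pragma message in the first hunk, could look like this: GCC 3.3 or later, targeting i386, compiled with -msse2 (which predefines __SSE2__).

    #if defined(__GNUC__) && defined(__i386__) && defined(__SSE2__) && \
        (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
        // hypothetical sketch of the availability gate; the real test lives in config.h
        #define SSE2_INTRINSICS_AVAILABLE
    #endif
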
diff --git a/integer.cpp b/integer.cpp
index f5b5fc4..93539dd 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -18,9 +18,16 @@
#include <iostream>
#ifdef SSE2_INTRINSICS_AVAILABLE
-#include <emmintrin.h>
+ #ifdef __GNUC__
+ #include <xmmintrin.h>
+ #include <malloc.h>
+ #else
+ #include <emmintrin.h>
+ #endif
#elif defined(_MSC_VER) && defined(_M_IX86)
-#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
+ #pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
+#elif defined(__GNUC__) && defined(__i386__)
+ #pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.")
#endif
NAMESPACE_BEGIN(CryptoPP)
@@ -41,7 +48,11 @@ CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(size_type n
{
#ifdef SSE2_INTRINSICS_AVAILABLE
if (n >= 4)
- return (T *)_mm_malloc(sizeof(T)*n, 16);
+ #ifdef __GNUC__
+ return (T *)memalign(16, sizeof(T)*n);
+ #else
+ return (T *)_mm_malloc(sizeof(T)*n, 16);
+ #endif
else
#endif
return new T[n];
@@ -53,10 +64,14 @@ void AlignedAllocator<T>::deallocate(void *p, size_type n)
memset(p, 0, n*sizeof(T));
#ifdef SSE2_INTRINSICS_AVAILABLE
if (n >= 4)
- _mm_free(p);
+ #ifdef __GNUC__
+ free(p);
+ #else
+ _mm_free(p);
+ #endif
else
#endif
- delete [] p;
+ delete [] (T *)p;
}
#endif
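
The two hunks above switch the 16-byte-aligned allocation path by compiler: under GCC the code uses memalign()/free() (older GCC headers did not provide _mm_malloc), while MSVC keeps _mm_malloc()/_mm_free(). A self-contained sketch of the same selection, with hypothetical helper names:

    #include <cstddef>
    #ifdef __GNUC__
        #include <malloc.h>     // memalign()
        #include <cstdlib>      // free()
    #else
        #include <emmintrin.h>  // _mm_malloc(), _mm_free()
    #endif

    static void *AlignedAlloc16(std::size_t bytes)
    {
    #ifdef __GNUC__
        return memalign(16, bytes);     // 16-byte aligned, released with free()
    #else
        return _mm_malloc(bytes, 16);
    #endif
    }

    static void AlignedFree16(void *p)
    {
    #ifdef __GNUC__
        free(p);
    #else
        _mm_free(p);
    #endif
    }
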
@@ -640,6 +655,13 @@ void Portable::Square2(word *R, const word *A)
void Portable::Square4(word *R, const word *A)
{
+#ifdef _MSC_VER
+ // VC60 workaround: MSVC 6.0 has an optimization bug that makes
+ // (dword)A*B where either A or B has been cast to a dword before
+ // very expensive. Revisit this function when this
+ // bug is fixed.
+ Multiply4(R, A, A);
+#else
const word *B = A;
DWord p, q;
word c, d, e;
@@ -666,6 +688,7 @@ void Portable::Square4(word *R, const word *A)
p = DWord::MultiplyAndAdd(A[3], A[3], d);
R[6] = p.GetLowHalf();
R[7] = e + p.GetHighHalf();
+#endif
}
void Portable::Multiply8(word *R, const word *A, const word *B)
@@ -834,125 +857,600 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
#undef SaveSquAcc
// CodeWarrior defines _MSC_VER
-#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)
+#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__))
class PentiumOptimized : public Portable
{
public:
- static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
- static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
-// TODO test this with .NET #if _MSC_VER < 1300
- static inline void Square4(word *R, const word *A)
- {
- // VC60 workaround: MSVC 6.0 has an optimization bug that makes
- // (dword)A*B where either A or B has been cast to a dword before
- // very expensive. Revisit this function when this
- // bug is fixed.
- Multiply4(R, A, A);
- }
-//#endif
+ static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
+ static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
+#ifdef __GNUC__
+ static void Square4(word *R, const word *A);
+ static void Multiply4(word *C, const word *A, const word *B);
+ static void Multiply8(word *C, const word *A, const word *B);
+#endif
};
typedef PentiumOptimized LowLevel;
-__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
+// this may be selected at run time
+class P4Optimized : public PentiumOptimized
{
- __asm
- {
- push ebp
- push ebx
- push esi
- push edi
+public:
+ static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
+ static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
+#ifdef SSE2_INTRINSICS_AVAILABLE
+ static void Multiply4(word *C, const word *A, const word *B);
+ static void Multiply8(word *C, const word *A, const word *B);
+ static void Multiply8Bottom(word *C, const word *A, const word *B);
+ static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);}
+#endif
+};
+
+// use some tricks to share assembly code between MSVC and GCC
+#ifdef _MSC_VER
+ #define CRYPTOPP_NAKED __declspec(naked)
+ #define AS1(x) __asm x
+ #define AS2(x, y) __asm x, y
+ #define PentiumPrologue \
+ __asm push ebp \
+ __asm push ebx \
+ __asm push esi \
+ __asm push edi \
+ __asm mov ecx, [esp+20] \
+ __asm mov edx, [esp+24] \
+ __asm mov ebx, [esp+28] \
+ __asm mov esi, [esp+32]
+ #define PentiumEpilogue \
+ __asm pop edi \
+ __asm pop esi \
+ __asm pop ebx \
+ __asm pop ebp \
+ __asm ret
+ #define P4Prologue \
+ __asm sub esp, 16 \
+ __asm mov [esp], edi \
+ __asm mov [esp+4], esi \
+ __asm mov [esp+8], ebx \
+ __asm mov [esp+12], ebp \
+ __asm mov ecx, [esp+20] \
+ __asm mov edx, [esp+24] \
+ __asm mov ebx, [esp+28] \
+ __asm mov esi, [esp+32]
+ #define P4Epilogue \
+ __asm mov edi, [esp] \
+ __asm mov esi, [esp+4] \
+ __asm mov ebx, [esp+8] \
+ __asm mov ebp, [esp+12] \
+ __asm add esp, 16 \
+ __asm ret
+#else
+ #define CRYPTOPP_NAKED
+ #define AS1(x) #x ";"
+ #define AS2(x, y) #x ", " #y ";"
+ #define PentiumPrologue \
+ __asm__ \
+ ( \
+ ".att_syntax prefix;" \
+ "mov %0, %%ecx;" \
+ "mov %1, %%edx;" \
+ "mov %2, %%ebx;" \
+ "mov %3, %%esi;" \
+ ".intel_syntax noprefix;"
+ #define PentiumEpilogue \
+ ".att_syntax prefix;" \
+ : \
+ : "m" (C), "m" (A), "m" (B), "m" (N) \
+ : "%ecx", "%edx", "%ebx", "%esi", "%edi" \
+ );
+ #define P4Prologue PentiumPrologue
+ #define P4Epilogue PentiumEpilogue
+#endif
- mov esi, [esp+24] ; N
- mov ebx, [esp+20] ; B
+CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
+{
+ PentiumPrologue
- // now: ebx = B, ecx = C, edx = A, esi = N
+ // now: ebx = B, ecx = C, edx = A, esi = N
+ AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
+ AS2( xor eax, eax) // clear eax
- sub ecx, edx // hold the distance between C & A so we can add this to A to get C
- xor eax, eax // clear eax
+ AS2( sub eax, esi) // eax is a negative index from end of B
+ AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
- sub eax, esi // eax is a negative index from end of B
- lea ebx, [ebx+4*esi] // ebx is end of B
+ AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
+ AS1( jz loopendAdd) // if no dwords then nothing to do
- sar eax, 1 // unit of eax is now dwords; this also clears the carry flag
- jz loopend // if no dwords then nothing to do
+ AS1(loopstartAdd:)
+ AS2( mov esi,[edx]) // load lower word of A
+ AS2( mov ebp,[edx+4]) // load higher word of A
-loopstart:
- mov esi,[edx] // load lower word of A
- mov ebp,[edx+4] // load higher word of A
+ AS2( mov edi,[ebx+8*eax]) // load lower word of B
+ AS2( lea edx,[edx+8]) // advance A and C
- mov edi,[ebx+8*eax] // load lower word of B
- lea edx,[edx+8] // advance A and C
+ AS2( adc esi,edi) // add lower words
+ AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
- adc esi,edi // add lower words
- mov edi,[ebx+8*eax+4] // load higher word of B
+ AS2( adc ebp,edi) // add higher words
+ AS1( inc eax) // advance B
- adc ebp,edi // add higher words
- inc eax // advance B
+ AS2( mov [edx+ecx-8],esi) // store lower word result
+ AS2( mov [edx+ecx-4],ebp) // store higher word result
- mov [edx+ecx-8],esi // store lower word result
- mov [edx+ecx-4],ebp // store higher word result
+ AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
- jnz loopstart // loop until eax overflows and becomes zero
+ AS1(loopendAdd:)
+ AS2( adc eax, 0) // store carry into eax (return result register)
-loopend:
- adc eax, 0 // store carry into eax (return result register)
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret 8
- }
+ PentiumEpilogue
}
-__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
+CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
- __asm
- {
- push ebp
- push ebx
- push esi
- push edi
+ PentiumPrologue
- mov esi, [esp+24] ; N
- mov ebx, [esp+20] ; B
+ // now: ebx = B, ecx = C, edx = A, esi = N
+ AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
+ AS2( xor eax, eax) // clear eax
- sub ecx, edx
- xor eax, eax
+ AS2( sub eax, esi) // eax is a negative index from end of B
+ AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
- sub eax, esi
- lea ebx, [ebx+4*esi]
+ AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag
+ AS1( jz loopendSub) // if no dwords then nothing to do
- sar eax, 1
- jz loopend
+ AS1(loopstartSub:)
+ AS2( mov esi,[edx]) // load lower word of A
+ AS2( mov ebp,[edx+4]) // load higher word of A
-loopstart:
- mov esi,[edx]
- mov ebp,[edx+4]
+ AS2( mov edi,[ebx+8*eax]) // load lower word of B
+ AS2( lea edx,[edx+8]) // advance A and C
- mov edi,[ebx+8*eax]
- lea edx,[edx+8]
+ AS2( sbb esi,edi) // subtract lower words
+ AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
- sbb esi,edi
- mov edi,[ebx+8*eax+4]
+ AS2( sbb ebp,edi) // subtract higher words
+ AS1( inc eax) // advance B
- sbb ebp,edi
- inc eax
+ AS2( mov [edx+ecx-8],esi) // store lower word result
+ AS2( mov [edx+ecx-4],ebp) // store higher word result
- mov [edx+ecx-8],esi
- mov [edx+ecx-4],ebp
+ AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
- jnz loopstart
+ AS1(loopendSub:)
+ AS2( adc eax, 0) // store carry into eax (return result register)
-loopend:
- adc eax, 0
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret 8
- }
+ PentiumEpilogue
+}
+
+CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
+{
+ P4Prologue
+
+ // now: ebx = B, ecx = C, edx = A, esi = N
+ AS2( xor eax, eax)
+ AS1( neg esi)
+ AS1( jz loopendAddP4) // if no dwords then nothing to do
+
+ AS2( mov edi, [edx])
+ AS2( mov ebp, [ebx])
+
+ AS1(loopstartAddP4:)
+ AS2( add edi, eax)
+ AS1( jc carry1AddP4)
+
+ AS2( xor eax, eax)
+
+ AS1(carry1continueAddP4:)
+ AS2( add edi, ebp)
+ AS2( mov ebp, 1)
+ AS2( mov [ecx], edi)
+ AS2( mov edi, [edx+4])
+ AS2( cmovc eax, ebp)
+ AS2( mov ebp, [ebx+4])
+ AS2( lea ebx, [ebx+8])
+ AS2( add edi, eax)
+ AS1( jc carry2AddP4)
+
+ AS2( xor eax, eax)
+
+ AS1(carry2continueAddP4:)
+ AS2( add edi, ebp)
+ AS2( mov ebp, 1)
+ AS2( cmovc eax, ebp)
+ AS2( mov [ecx+4], edi)
+ AS2( add ecx, 8)
+ AS2( mov edi, [edx+8])
+ AS2( add edx, 8)
+ AS2( add esi, 2)
+ AS2( mov ebp, [ebx])
+ AS1( jnz loopstartAddP4)
+ AS1( jmp loopendAddP4)
+
+ AS1(carry1AddP4:)
+ AS2( mov eax, 1)
+ AS1( jmp carry1continueAddP4)
+
+ AS1(carry2AddP4:)
+ AS2( mov eax, 1)
+ AS1( jmp carry2continueAddP4)
+
+ AS1(loopendAddP4:)
+
+ P4Epilogue
+}
+
+CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
+{
+ P4Prologue
+
+ // now: ebx = B, ecx = C, edx = A, esi = N
+ AS2( xor eax, eax)
+ AS1( neg esi)
+ AS1( jz loopendSubP4) // if no dwords then nothing to do
+
+ AS2( mov edi, [edx])
+ AS2( mov ebp, [ebx])
+
+ AS1(loopstartSubP4:)
+ AS2( sub edi, eax)
+ AS1( jc carry1SubP4)
+
+ AS2( xor eax, eax)
+
+ AS1(carry1continueSubP4:)
+ AS2( sub edi, ebp)
+ AS2( mov ebp, 1)
+ AS2( mov [ecx], edi)
+ AS2( mov edi, [edx+4])
+ AS2( cmovc eax, ebp)
+ AS2( mov ebp, [ebx+4])
+ AS2( lea ebx, [ebx+8])
+ AS2( sub edi, eax)
+ AS1( jc carry2SubP4)
+
+ AS2( xor eax, eax)
+
+ AS1(carry2continueSubP4:)
+ AS2( sub edi, ebp)
+ AS2( mov ebp, 1)
+ AS2( cmovc eax, ebp)
+ AS2( mov [ecx+4], edi)
+ AS2( add ecx, 8)
+ AS2( mov edi, [edx+8])
+ AS2( add edx, 8)
+ AS2( add esi, 2)
+ AS2( mov ebp, [ebx])
+ AS1( jnz loopstartSubP4)
+ AS1( jmp loopendSubP4)
+
+ AS1(carry1SubP4:)
+ AS2( mov eax, 1)
+ AS1( jmp carry1continueSubP4)
+
+ AS1(carry2SubP4:)
+ AS2( mov eax, 1)
+ AS1( jmp carry2continueSubP4)
+
+ AS1(loopendSubP4:)
+
+ P4Epilogue
+}
+
+#if __GNUC__
+// Comba square and multiply assembly code originally contributed by Leonard Janke
+// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code.
+
+#define SqrStartup \
+ "push %%ebp\n\t" \
+ "push %%esi\n\t" \
+ "push %%ebx\n\t" \
+ "xor %%ebp, %%ebp\n\t" \
+ "xor %%ebx, %%ebx\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+
+#define SqrShiftCarry \
+ "mov %%ebx, %%ebp\n\t" \
+ "mov %%ecx, %%ebx\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+
+#define SqrAccumulate(i,j) \
+ "mov 4*"#j"(%%esi), %%eax\n\t" \
+ "mull 4*"#i"(%%esi)\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edx, %%ebx\n\t" \
+ "adc %%ch, %%cl\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edx, %%ebx\n\t" \
+ "adc %%ch, %%cl\n\t"
+
+#define SqrAccumulateCentre(i) \
+ "mov 4*"#i"(%%esi), %%eax\n\t" \
+ "mull 4*"#i"(%%esi)\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edx, %%ebx\n\t" \
+ "adc %%ch, %%cl\n\t"
+
+#define SqrStoreDigit(X) \
+ "mov %%ebp, 4*"#X"(%%edi)\n\t" \
+
+#define SqrLastDiagonal(digits) \
+ "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
+ "mull 4*("#digits"-1)(%%esi)\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edx, %%ebx\n\t" \
+ "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
+ "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
+
+#define SqrCleanup \
+ "pop %%ebx\n\t" \
+ "pop %%esi\n\t" \
+ "pop %%ebp\n\t"
+
+void PentiumOptimized::Square4(word* Y, const word* X)
+{
+ __asm__ __volatile__(
+ SqrStartup
+
+ SqrAccumulateCentre(0)
+ SqrStoreDigit(0)
+ SqrShiftCarry
+
+ SqrAccumulate(1,0)
+ SqrStoreDigit(1)
+ SqrShiftCarry
+
+ SqrAccumulate(2,0)
+ SqrAccumulateCentre(1)
+ SqrStoreDigit(2)
+ SqrShiftCarry
+
+ SqrAccumulate(3,0)
+ SqrAccumulate(2,1)
+ SqrStoreDigit(3)
+ SqrShiftCarry
+
+ SqrAccumulate(3,1)
+ SqrAccumulateCentre(2)
+ SqrStoreDigit(4)
+ SqrShiftCarry
+
+ SqrAccumulate(3,2)
+ SqrStoreDigit(5)
+ SqrShiftCarry
+
+ SqrLastDiagonal(4)
+
+ SqrCleanup
+
+ :
+ : "D" (Y), "S" (X)
+ : "eax", "ecx", "edx", "ebp", "memory"
+ );
+}
+
+#define MulStartup \
+ "push %%ebp\n\t" \
+ "push %%esi\n\t" \
+ "push %%ebx\n\t" \
+ "push %%edi\n\t" \
+ "mov %%eax, %%ebx \n\t" \
+ "xor %%ebp, %%ebp\n\t" \
+ "xor %%edi, %%edi\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+
+#define MulShiftCarry \
+ "mov %%edx, %%ebp\n\t" \
+ "mov %%ecx, %%edi\n\t" \
+ "xor %%ecx, %%ecx\n\t"
+
+#define MulAccumulate(i,j) \
+ "mov 4*"#j"(%%ebx), %%eax\n\t" \
+ "mull 4*"#i"(%%esi)\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edx, %%edi\n\t" \
+ "adc %%ch, %%cl\n\t"
+
+#define MulStoreDigit(X) \
+ "mov %%edi, %%edx \n\t" \
+ "mov (%%esp), %%edi \n\t" \
+ "mov %%ebp, 4*"#X"(%%edi)\n\t" \
+ "mov %%edi, (%%esp)\n\t"
+
+#define MulLastDiagonal(digits) \
+ "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
+ "mull 4*("#digits"-1)(%%esi)\n\t" \
+ "add %%eax, %%ebp\n\t" \
+ "adc %%edi, %%edx\n\t" \
+ "mov (%%esp), %%edi\n\t" \
+ "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
+ "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
+
+#define MulCleanup \
+ "pop %%edi\n\t" \
+ "pop %%ebx\n\t" \
+ "pop %%esi\n\t" \
+ "pop %%ebp\n\t"
+
+void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
+{
+ __asm__ __volatile__(
+ MulStartup
+ MulAccumulate(0,0)
+ MulStoreDigit(0)
+ MulShiftCarry
+
+ MulAccumulate(1,0)
+ MulAccumulate(0,1)
+ MulStoreDigit(1)
+ MulShiftCarry
+
+ MulAccumulate(2,0)
+ MulAccumulate(1,1)
+ MulAccumulate(0,2)
+ MulStoreDigit(2)
+ MulShiftCarry
+
+ MulAccumulate(3,0)
+ MulAccumulate(2,1)
+ MulAccumulate(1,2)
+ MulAccumulate(0,3)
+ MulStoreDigit(3)
+ MulShiftCarry
+
+ MulAccumulate(3,1)
+ MulAccumulate(2,2)
+ MulAccumulate(1,3)
+ MulStoreDigit(4)
+ MulShiftCarry
+
+ MulAccumulate(3,2)
+ MulAccumulate(2,3)
+ MulStoreDigit(5)
+ MulShiftCarry
+
+ MulLastDiagonal(4)
+
+ MulCleanup
+
+ :
+ : "D" (Z), "S" (X), "a" (Y)
+ : "%ecx", "%edx", "memory"
+ );
+}
+
+void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
+{
+ __asm__ __volatile__(
+ MulStartup
+ MulAccumulate(0,0)
+ MulStoreDigit(0)
+ MulShiftCarry
+
+ MulAccumulate(1,0)
+ MulAccumulate(0,1)
+ MulStoreDigit(1)
+ MulShiftCarry
+
+ MulAccumulate(2,0)
+ MulAccumulate(1,1)
+ MulAccumulate(0,2)
+ MulStoreDigit(2)
+ MulShiftCarry
+
+ MulAccumulate(3,0)
+ MulAccumulate(2,1)
+ MulAccumulate(1,2)
+ MulAccumulate(0,3)
+ MulStoreDigit(3)
+ MulShiftCarry
+
+ MulAccumulate(4,0)
+ MulAccumulate(3,1)
+ MulAccumulate(2,2)
+ MulAccumulate(1,3)
+ MulAccumulate(0,4)
+ MulStoreDigit(4)
+ MulShiftCarry
+
+ MulAccumulate(5,0)
+ MulAccumulate(4,1)
+ MulAccumulate(3,2)
+ MulAccumulate(2,3)
+ MulAccumulate(1,4)
+ MulAccumulate(0,5)
+ MulStoreDigit(5)
+ MulShiftCarry
+
+ MulAccumulate(6,0)
+ MulAccumulate(5,1)
+ MulAccumulate(4,2)
+ MulAccumulate(3,3)
+ MulAccumulate(2,4)
+ MulAccumulate(1,5)
+ MulAccumulate(0,6)
+ MulStoreDigit(6)
+ MulShiftCarry
+
+ MulAccumulate(7,0)
+ MulAccumulate(6,1)
+ MulAccumulate(5,2)
+ MulAccumulate(4,3)
+ MulAccumulate(3,4)
+ MulAccumulate(2,5)
+ MulAccumulate(1,6)
+ MulAccumulate(0,7)
+ MulStoreDigit(7)
+ MulShiftCarry
+
+ MulAccumulate(7,1)
+ MulAccumulate(6,2)
+ MulAccumulate(5,3)
+ MulAccumulate(4,4)
+ MulAccumulate(3,5)
+ MulAccumulate(2,6)
+ MulAccumulate(1,7)
+ MulStoreDigit(8)
+ MulShiftCarry
+
+ MulAccumulate(7,2)
+ MulAccumulate(6,3)
+ MulAccumulate(5,4)
+ MulAccumulate(4,5)
+ MulAccumulate(3,6)
+ MulAccumulate(2,7)
+ MulStoreDigit(9)
+ MulShiftCarry
+
+ MulAccumulate(7,3)
+ MulAccumulate(6,4)
+ MulAccumulate(5,5)
+ MulAccumulate(4,6)
+ MulAccumulate(3,7)
+ MulStoreDigit(10)
+ MulShiftCarry
+
+ MulAccumulate(7,4)
+ MulAccumulate(6,5)
+ MulAccumulate(5,6)
+ MulAccumulate(4,7)
+ MulStoreDigit(11)
+ MulShiftCarry
+
+ MulAccumulate(7,5)
+ MulAccumulate(6,6)
+ MulAccumulate(5,7)
+ MulStoreDigit(12)
+ MulShiftCarry
+
+ MulAccumulate(7,6)
+ MulAccumulate(6,7)
+ MulStoreDigit(13)
+ MulShiftCarry
+
+ MulLastDiagonal(8)
+
+ MulCleanup
+
+ :
+ : "D" (Z), "S" (X), "a" (Y)
+ : "%ecx", "%edx", "memory"
+ );
+}
+
+#endif // __GNUC__
+
+#else // not x86 - no processor specific code at this layer
+
+typedef Portable LowLevel;
+
+#endif
+
+bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
+
+void DisableSSE2()
+{
+ g_sse2Enabled = false;
}
#ifdef SSE2_INTRINSICS_AVAILABLE
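
The AS1/AS2, PentiumPrologue/PentiumEpilogue and P4Prologue/P4Epilogue macros introduced above let one instruction list compile either as MSVC __asm statements or as a single GCC asm() template built from stringized, concatenated literals; the GCC variant loads its operands in AT&T syntax, switches to Intel syntax for the register-only body, then switches back. A minimal sketch of the same trick (GCC/i386 side only; DEMO_AS2 and demo_add32 are hypothetical names, not the library's macros):

    #if defined(__GNUC__) && defined(__i386__)
    #define DEMO_AS2(x, y) #x ", " #y ";"    // stringize one Intel-syntax instruction

    static unsigned int demo_add32(unsigned int a, unsigned int b)
    {
        unsigned int r;
        __asm__
        (
            ".att_syntax prefix;"
            "mov %1, %%eax;"                 // load operands while still in AT&T syntax
            "mov %2, %%edx;"
            ".intel_syntax noprefix;"        // register-only body in Intel syntax
            DEMO_AS2(add eax, edx)
            ".att_syntax prefix;"
            "mov %%eax, %0;"
            : "=m" (r)
            : "m" (a), "m" (b)
            : "%eax", "%edx", "cc"
        );
        return r;
    }
    #endif
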
@@ -961,23 +1459,20 @@ static bool GetSSE2Capability()
{
word32 b;
+#ifdef __GNUC__
+ __asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx");
+#else
__asm
{
mov eax, 1
cpuid
mov b, edx
}
+#endif
return (b & (1 << 26)) != 0;
}
-bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
-
-void DisableSSE2()
-{
- g_sse2Enabled = false;
-}
-
static inline bool HasSSE2()
{
if (g_sse2Enabled && !g_sse2DetectionDone)
@@ -988,19 +1483,9 @@ static inline bool HasSSE2()
return g_sse2Enabled && g_sse2Detected;
}
-class P4Optimized : public PentiumOptimized
-{
-public:
- static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
- static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
- static void Multiply4(word *C, const word *A, const word *B);
- static void Multiply8(word *C, const word *A, const word *B);
- static inline void Square4(word *R, const word *A)
- {
- Multiply4(R, A, A);
- }
- static void Multiply8Bottom(word *C, const word *A, const word *B);
-};
+#ifdef __GNUC__
+#define __fastcall
+#endif
static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
{
@@ -1072,7 +1557,7 @@ void P4Optimized::Multiply4(word *C, const word *A, const word *B)
__m64 s1, s2;
- __m64 w1 = _m_from_int(w[1]);
+ __m64 w1 = _mm_cvtsi32_si64(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
@@ -1083,38 +1568,38 @@ void P4Optimized::Multiply4(word *C, const word *A, const word *B)
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
- __m64 w26 = _m_from_int(w[26]);
+ __m64 w26 = _mm_cvtsi32_si64(w[26]);
s1 = _mm_add_si64(w1, w4);
- C[1] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[1] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
- C[2] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[2] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
- C[3] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[3] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s2);
- C[4] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[4] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w18, w20);
s1 = _mm_add_si64(s1, s2);
- C[5] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[5] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w22, w26);
s1 = _mm_add_si64(s1, s2);
- C[6] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[6] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
- C[7] = _m_to_int(s1) + w[27];
+ C[7] = _mm_cvtsi64_si32(s1) + w[27];
_mm_empty();
}
@@ -1142,7 +1627,7 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
__m64 s1, s2, s3, s4;
- __m64 w1 = _m_from_int(w[1]);
+ __m64 w1 = _mm_cvtsi32_si64(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
@@ -1153,11 +1638,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
- __m64 w26 = _m_from_int(w[26]);
- __m64 w27 = _m_from_int(w[27]);
+ __m64 w26 = _mm_cvtsi32_si64(w[26]);
+ __m64 w27 = _mm_cvtsi32_si64(w[27]);
- __m64 x0 = _m_from_int(x[0]);
- __m64 x1 = _m_from_int(x[1]);
+ __m64 x0 = _mm_cvtsi32_si64(x[0]);
+ __m64 x1 = _mm_cvtsi32_si64(x[1]);
__m64 x4 = mx[2];
__m64 x6 = mx[3];
__m64 x8 = mx[4];
@@ -1168,11 +1653,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
__m64 x18 = mx[9];
__m64 x20 = mx[10];
__m64 x22 = mx[11];
- __m64 x26 = _m_from_int(x[26]);
- __m64 x27 = _m_from_int(x[27]);
+ __m64 x26 = _mm_cvtsi32_si64(x[26]);
+ __m64 x27 = _mm_cvtsi32_si64(x[27]);
- __m64 y0 = _m_from_int(y[0]);
- __m64 y1 = _m_from_int(y[1]);
+ __m64 y0 = _mm_cvtsi32_si64(y[0]);
+ __m64 y1 = _mm_cvtsi32_si64(y[1]);
__m64 y4 = my[2];
__m64 y6 = my[3];
__m64 y8 = my[4];
@@ -1183,11 +1668,11 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
__m64 y18 = my[9];
__m64 y20 = my[10];
__m64 y22 = my[11];
- __m64 y26 = _m_from_int(y[26]);
- __m64 y27 = _m_from_int(y[27]);
+ __m64 y26 = _mm_cvtsi32_si64(y[26]);
+ __m64 y27 = _mm_cvtsi32_si64(y[27]);
- __m64 z0 = _m_from_int(z[0]);
- __m64 z1 = _m_from_int(z[1]);
+ __m64 z0 = _mm_cvtsi32_si64(z[0]);
+ __m64 z1 = _mm_cvtsi32_si64(z[1]);
__m64 z4 = mz[2];
__m64 z6 = mz[3];
__m64 z8 = mz[4];
@@ -1198,28 +1683,28 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
__m64 z18 = mz[9];
__m64 z20 = mz[10];
__m64 z22 = mz[11];
- __m64 z26 = _m_from_int(z[26]);
+ __m64 z26 = _mm_cvtsi32_si64(z[26]);
s1 = _mm_add_si64(w1, w4);
- C[1] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[1] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
- C[2] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[2] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
- C[3] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[3] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x0, y0);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s3);
s1 = _mm_add_si64(s1, s2);
- C[4] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[4] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x1, y1);
s4 = _mm_add_si64(x4, y4);
@@ -1227,8 +1712,8 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w20);
s1 = _mm_add_si64(s1, s3);
- C[5] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[5] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x6, y6);
s4 = _mm_add_si64(x8, y8);
@@ -1236,24 +1721,24 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w26);
s1 = _mm_add_si64(s1, s3);
- C[6] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[6] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x10, y10);
s4 = _mm_add_si64(x12, y12);
s1 = _mm_add_si64(s1, w27);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, s3);
- C[7] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[7] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x14, y14);
s4 = _mm_add_si64(x16, y16);
s1 = _mm_add_si64(s1, z0);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, s3);
- C[8] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[8] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x18, y18);
s4 = _mm_add_si64(x20, y20);
@@ -1261,8 +1746,8 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, z4);
s1 = _mm_add_si64(s1, s3);
- C[9] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[9] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x22, y22);
s4 = _mm_add_si64(x26, y26);
@@ -1270,32 +1755,32 @@ void P4Optimized::Multiply8(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, z8);
s1 = _mm_add_si64(s1, s3);
- C[10] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[10] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x27, y27);
s1 = _mm_add_si64(s1, z10);
s1 = _mm_add_si64(s1, z12);
s1 = _mm_add_si64(s1, s3);
- C[11] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[11] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(z14, z16);
s1 = _mm_add_si64(s1, s3);
- C[12] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[12] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(z18, z20);
s1 = _mm_add_si64(s1, s3);
- C[13] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[13] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(z22, z26);
s1 = _mm_add_si64(s1, s3);
- C[14] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[14] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
- C[15] = z[27] + _m_to_int(s1);
+ C[15] = z[27] + _mm_cvtsi64_si32(s1);
_mm_empty();
}
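
The substitutions in these hunks only rename MMX intrinsics to their standard spellings; the operations themselves are unchanged. A small reference sketch of the equivalences (the demo_* helpers are hypothetical):

    #include <mmintrin.h>

    static inline __m64 demo_from_int(int x) { return _mm_cvtsi32_si64(x); }  // was _m_from_int(x)
    static inline int   demo_to_int(__m64 v) { return _mm_cvtsi64_si32(v); }  // was _m_to_int(v)
    static inline __m64 demo_shr32(__m64 v)  { return _mm_srli_si64(v, 32); } // was _m_psrlqi(v, 32)
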
@@ -1319,7 +1804,7 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
__m64 s1, s2, s3, s4;
- __m64 w1 = _m_from_int(w[1]);
+ __m64 w1 = _mm_cvtsi32_si64(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
@@ -1330,40 +1815,40 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
- __m64 w26 = _m_from_int(w[26]);
+ __m64 w26 = _mm_cvtsi32_si64(w[26]);
- __m64 x0 = _m_from_int(x[0]);
- __m64 x1 = _m_from_int(x[1]);
+ __m64 x0 = _mm_cvtsi32_si64(x[0]);
+ __m64 x1 = _mm_cvtsi32_si64(x[1]);
__m64 x4 = mx[2];
__m64 x6 = mx[3];
__m64 x8 = mx[4];
- __m64 y0 = _m_from_int(y[0]);
- __m64 y1 = _m_from_int(y[1]);
+ __m64 y0 = _mm_cvtsi32_si64(y[0]);
+ __m64 y1 = _mm_cvtsi32_si64(y[1]);
__m64 y4 = my[2];
__m64 y6 = my[3];
__m64 y8 = my[4];
s1 = _mm_add_si64(w1, w4);
- C[1] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[1] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
- C[2] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[2] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
- C[3] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[3] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x0, y0);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s3);
s1 = _mm_add_si64(s1, s2);
- C[4] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[4] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x1, y1);
s4 = _mm_add_si64(x4, y4);
@@ -1371,8 +1856,8 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w20);
s1 = _mm_add_si64(s1, s3);
- C[5] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[5] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
s3 = _mm_add_si64(x6, y6);
s4 = _mm_add_si64(x8, y8);
@@ -1380,558 +1865,15 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w26);
s1 = _mm_add_si64(s1, s3);
- C[6] = _m_to_int(s1);
- s1 = _m_psrlqi(s1, 32);
+ C[6] = _mm_cvtsi64_si32(s1);
+ s1 = _mm_srli_si64(s1, 32);
- C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
+ C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
_mm_empty();
}
-__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
-{
- __asm
- {
- sub esp, 16
- xor eax, eax
- mov [esp], edi
- mov [esp+4], esi
- mov [esp+8], ebx
- mov [esp+12], ebp
-
- mov ebx, [esp+20] // B
- mov esi, [esp+24] // N
-
- // now: ebx = B, ecx = C, edx = A, esi = N
-
- neg esi
- jz loopend // if no dwords then nothing to do
-
- mov edi, [edx]
- mov ebp, [ebx]
-
-loopstart:
- add edi, eax
- jc carry1
-
- xor eax, eax
-
-carry1continue:
- add edi, ebp
- mov ebp, 1
- mov [ecx], edi
- mov edi, [edx+4]
- cmovc eax, ebp
- mov ebp, [ebx+4]
- lea ebx, [ebx+8]
- add edi, eax
- jc carry2
-
- xor eax, eax
-
-carry2continue:
- add edi, ebp
- mov ebp, 1
- cmovc eax, ebp
- mov [ecx+4], edi
- add ecx, 8
- mov edi, [edx+8]
- add edx, 8
- add esi, 2
- mov ebp, [ebx]
- jnz loopstart
-
-loopend:
- mov edi, [esp]
- mov esi, [esp+4]
- mov ebx, [esp+8]
- mov ebp, [esp+12]
- add esp, 16
- ret 8
-
-carry1:
- mov eax, 1
- jmp carry1continue
-
-carry2:
- mov eax, 1
- jmp carry2continue
- }
-}
-
-__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
-{
- __asm
- {
- sub esp, 16
- xor eax, eax
- mov [esp], edi
- mov [esp+4], esi
- mov [esp+8], ebx
- mov [esp+12], ebp
-
- mov ebx, [esp+20] // B
- mov esi, [esp+24] // N
-
- // now: ebx = B, ecx = C, edx = A, esi = N
-
- neg esi
- jz loopend // if no dwords then nothing to do
-
- mov edi, [edx]
- mov ebp, [ebx]
-
-loopstart:
- sub edi, eax
- jc carry1
-
- xor eax, eax
-
-carry1continue:
- sub edi, ebp
- mov ebp, 1
- mov [ecx], edi
- mov edi, [edx+4]
- cmovc eax, ebp
- mov ebp, [ebx+4]
- lea ebx, [ebx+8]
- sub edi, eax
- jc carry2
-
- xor eax, eax
-
-carry2continue:
- sub edi, ebp
- mov ebp, 1
- cmovc eax, ebp
- mov [ecx+4], edi
- add ecx, 8
- mov edi, [edx+8]
- add edx, 8
- add esi, 2
- mov ebp, [ebx]
- jnz loopstart
-
-loopend:
- mov edi, [esp]
- mov esi, [esp+4]
- mov ebx, [esp+8]
- mov ebp, [esp+12]
- add esp, 16
- ret 8
-
-carry1:
- mov eax, 1
- jmp carry1continue
-
-carry2:
- mov eax, 1
- jmp carry2continue
- }
-}
-
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
-#elif defined(__GNUC__) && defined(__i386__)
-
-class PentiumOptimized : public Portable
-{
-public:
-#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code
- static word Add(word *C, const word *A, const word *B, unsigned int N);
- static word Subtract(word *C, const word *A, const word *B, unsigned int N);
-#endif
- static void Square4(word *R, const word *A);
- static void Multiply4(word *C, const word *A, const word *B);
- static void Multiply8(word *C, const word *A, const word *B);
-};
-
-typedef PentiumOptimized LowLevel;
-
-// Add and Subtract assembly code originally contributed by Alister Lee
-
-#ifndef __pic__
-__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
-{
- assert (N%2 == 0);
-
- register word carry, temp;
-
- __asm__ __volatile__(
- "push %%ebp;"
- "sub %3, %2;"
- "xor %0, %0;"
- "sub %4, %0;"
- "lea (%1,%4,4), %1;"
- "sar $1, %0;"
- "jz 1f;"
-
- "0:;"
- "mov 0(%3), %4;"
- "mov 4(%3), %%ebp;"
- "mov (%1,%0,8), %5;"
- "lea 8(%3), %3;"
- "adc %5, %4;"
- "mov 4(%1,%0,8), %5;"
- "adc %5, %%ebp;"
- "inc %0;"
- "mov %4, -8(%3, %2);"
- "mov %%ebp, -4(%3, %2);"
- "jnz 0b;"
-
- "1:;"
- "adc $0, %0;"
- "pop %%ebp;"
-
- : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
- : : "cc", "memory");
-
- return carry;
-}
-
-__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
-{
- assert (N%2 == 0);
-
- register word carry, temp;
-
- __asm__ __volatile__(
- "push %%ebp;"
- "sub %3, %2;"
- "xor %0, %0;"
- "sub %4, %0;"
- "lea (%1,%4,4), %1;"
- "sar $1, %0;"
- "jz 1f;"
-
- "0:;"
- "mov 0(%3), %4;"
- "mov 4(%3), %%ebp;"
- "mov (%1,%0,8), %5;"
- "lea 8(%3), %3;"
- "sbb %5, %4;"
- "mov 4(%1,%0,8), %5;"
- "sbb %5, %%ebp;"
- "inc %0;"
- "mov %4, -8(%3, %2);"
- "mov %%ebp, -4(%3, %2);"
- "jnz 0b;"
-
- "1:;"
- "adc $0, %0;"
- "pop %%ebp;"
-
- : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
- : : "cc", "memory");
-
- return carry;
-}
-#endif // __pic__
-
-// Comba square and multiply assembly code originally contributed by Leonard Janke
-
-#define SqrStartup \
- "push %%ebp\n\t" \
- "push %%esi\n\t" \
- "push %%ebx\n\t" \
- "xor %%ebp, %%ebp\n\t" \
- "xor %%ebx, %%ebx\n\t" \
- "xor %%ecx, %%ecx\n\t"
-
-#define SqrShiftCarry \
- "mov %%ebx, %%ebp\n\t" \
- "mov %%ecx, %%ebx\n\t" \
- "xor %%ecx, %%ecx\n\t"
-
-#define SqrAccumulate(i,j) \
- "mov 4*"#j"(%%esi), %%eax\n\t" \
- "mull 4*"#i"(%%esi)\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edx, %%ebx\n\t" \
- "adc %%ch, %%cl\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edx, %%ebx\n\t" \
- "adc %%ch, %%cl\n\t"
-
-#define SqrAccumulateCentre(i) \
- "mov 4*"#i"(%%esi), %%eax\n\t" \
- "mull 4*"#i"(%%esi)\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edx, %%ebx\n\t" \
- "adc %%ch, %%cl\n\t"
-
-#define SqrStoreDigit(X) \
- "mov %%ebp, 4*"#X"(%%edi)\n\t" \
-
-#define SqrLastDiagonal(digits) \
- "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
- "mull 4*("#digits"-1)(%%esi)\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edx, %%ebx\n\t" \
- "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
- "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
-
-#define SqrCleanup \
- "pop %%ebx\n\t" \
- "pop %%esi\n\t" \
- "pop %%ebp\n\t"
-
-void PentiumOptimized::Square4(word* Y, const word* X)
-{
- __asm__ __volatile__(
- SqrStartup
-
- SqrAccumulateCentre(0)
- SqrStoreDigit(0)
- SqrShiftCarry
-
- SqrAccumulate(1,0)
- SqrStoreDigit(1)
- SqrShiftCarry
-
- SqrAccumulate(2,0)
- SqrAccumulateCentre(1)
- SqrStoreDigit(2)
- SqrShiftCarry
-
- SqrAccumulate(3,0)
- SqrAccumulate(2,1)
- SqrStoreDigit(3)
- SqrShiftCarry
-
- SqrAccumulate(3,1)
- SqrAccumulateCentre(2)
- SqrStoreDigit(4)
- SqrShiftCarry
-
- SqrAccumulate(3,2)
- SqrStoreDigit(5)
- SqrShiftCarry
-
- SqrLastDiagonal(4)
-
- SqrCleanup
-
- :
- : "D" (Y), "S" (X)
- : "eax", "ecx", "edx", "ebp", "memory"
- );
-}
-
-#define MulStartup \
- "push %%ebp\n\t" \
- "push %%esi\n\t" \
- "push %%ebx\n\t" \
- "push %%edi\n\t" \
- "mov %%eax, %%ebx \n\t" \
- "xor %%ebp, %%ebp\n\t" \
- "xor %%edi, %%edi\n\t" \
- "xor %%ecx, %%ecx\n\t"
-
-#define MulShiftCarry \
- "mov %%edx, %%ebp\n\t" \
- "mov %%ecx, %%edi\n\t" \
- "xor %%ecx, %%ecx\n\t"
-
-#define MulAccumulate(i,j) \
- "mov 4*"#j"(%%ebx), %%eax\n\t" \
- "mull 4*"#i"(%%esi)\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edx, %%edi\n\t" \
- "adc %%ch, %%cl\n\t"
-
-#define MulStoreDigit(X) \
- "mov %%edi, %%edx \n\t" \
- "mov (%%esp), %%edi \n\t" \
- "mov %%ebp, 4*"#X"(%%edi)\n\t" \
- "mov %%edi, (%%esp)\n\t"
-
-#define MulLastDiagonal(digits) \
- "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
- "mull 4*("#digits"-1)(%%esi)\n\t" \
- "add %%eax, %%ebp\n\t" \
- "adc %%edi, %%edx\n\t" \
- "mov (%%esp), %%edi\n\t" \
- "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
- "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
-
-#define MulCleanup \
- "pop %%edi\n\t" \
- "pop %%ebx\n\t" \
- "pop %%esi\n\t" \
- "pop %%ebp\n\t"
-
-void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
-{
- __asm__ __volatile__(
- MulStartup
- MulAccumulate(0,0)
- MulStoreDigit(0)
- MulShiftCarry
-
- MulAccumulate(1,0)
- MulAccumulate(0,1)
- MulStoreDigit(1)
- MulShiftCarry
-
- MulAccumulate(2,0)
- MulAccumulate(1,1)
- MulAccumulate(0,2)
- MulStoreDigit(2)
- MulShiftCarry
-
- MulAccumulate(3,0)
- MulAccumulate(2,1)
- MulAccumulate(1,2)
- MulAccumulate(0,3)
- MulStoreDigit(3)
- MulShiftCarry
-
- MulAccumulate(3,1)
- MulAccumulate(2,2)
- MulAccumulate(1,3)
- MulStoreDigit(4)
- MulShiftCarry
-
- MulAccumulate(3,2)
- MulAccumulate(2,3)
- MulStoreDigit(5)
- MulShiftCarry
-
- MulLastDiagonal(4)
-
- MulCleanup
-
- :
- : "D" (Z), "S" (X), "a" (Y)
- : "%ecx", "%edx", "memory"
- );
-}
-
-void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
-{
- __asm__ __volatile__(
- MulStartup
- MulAccumulate(0,0)
- MulStoreDigit(0)
- MulShiftCarry
-
- MulAccumulate(1,0)
- MulAccumulate(0,1)
- MulStoreDigit(1)
- MulShiftCarry
-
- MulAccumulate(2,0)
- MulAccumulate(1,1)
- MulAccumulate(0,2)
- MulStoreDigit(2)
- MulShiftCarry
-
- MulAccumulate(3,0)
- MulAccumulate(2,1)
- MulAccumulate(1,2)
- MulAccumulate(0,3)
- MulStoreDigit(3)
- MulShiftCarry
-
- MulAccumulate(4,0)
- MulAccumulate(3,1)
- MulAccumulate(2,2)
- MulAccumulate(1,3)
- MulAccumulate(0,4)
- MulStoreDigit(4)
- MulShiftCarry
-
- MulAccumulate(5,0)
- MulAccumulate(4,1)
- MulAccumulate(3,2)
- MulAccumulate(2,3)
- MulAccumulate(1,4)
- MulAccumulate(0,5)
- MulStoreDigit(5)
- MulShiftCarry
-
- MulAccumulate(6,0)
- MulAccumulate(5,1)
- MulAccumulate(4,2)
- MulAccumulate(3,3)
- MulAccumulate(2,4)
- MulAccumulate(1,5)
- MulAccumulate(0,6)
- MulStoreDigit(6)
- MulShiftCarry
-
- MulAccumulate(7,0)
- MulAccumulate(6,1)
- MulAccumulate(5,2)
- MulAccumulate(4,3)
- MulAccumulate(3,4)
- MulAccumulate(2,5)
- MulAccumulate(1,6)
- MulAccumulate(0,7)
- MulStoreDigit(7)
- MulShiftCarry
-
- MulAccumulate(7,1)
- MulAccumulate(6,2)
- MulAccumulate(5,3)
- MulAccumulate(4,4)
- MulAccumulate(3,5)
- MulAccumulate(2,6)
- MulAccumulate(1,7)
- MulStoreDigit(8)
- MulShiftCarry
-
- MulAccumulate(7,2)
- MulAccumulate(6,3)
- MulAccumulate(5,4)
- MulAccumulate(4,5)
- MulAccumulate(3,6)
- MulAccumulate(2,7)
- MulStoreDigit(9)
- MulShiftCarry
-
- MulAccumulate(7,3)
- MulAccumulate(6,4)
- MulAccumulate(5,5)
- MulAccumulate(4,6)
- MulAccumulate(3,7)
- MulStoreDigit(10)
- MulShiftCarry
-
- MulAccumulate(7,4)
- MulAccumulate(6,5)
- MulAccumulate(5,6)
- MulAccumulate(4,7)
- MulStoreDigit(11)
- MulShiftCarry
-
- MulAccumulate(7,5)
- MulAccumulate(6,6)
- MulAccumulate(5,7)
- MulStoreDigit(12)
- MulShiftCarry
-
- MulAccumulate(7,6)
- MulAccumulate(6,7)
- MulStoreDigit(13)
- MulShiftCarry
-
- MulLastDiagonal(8)
-
- MulCleanup
-
- :
- : "D" (Z), "S" (X), "a" (Y)
- : "%ecx", "%edx", "memory"
- );
-}
-
-#else // no processor specific code at this layer
-
-typedef Portable LowLevel;
-
-#endif
-
// ********************************************************
#define A0 A