From e84412066595cc6bec6565fbbf1a5ea6936c11d3 Mon Sep 17 00:00:00 2001 From: weidai Date: Thu, 31 Jul 2003 01:54:53 +0000 Subject: enable SSE2 intrinsics on GCC 3.3 or later git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@121 57ff6487-cd31-0410-9ec3-f628ee90f5f0 --- integer.cpp | 1804 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 873 insertions(+), 931 deletions(-) (limited to 'integer.cpp') diff --git a/integer.cpp b/integer.cpp index f5b5fc4..93539dd 100644 --- a/integer.cpp +++ b/integer.cpp @@ -18,9 +18,16 @@ #include #ifdef SSE2_INTRINSICS_AVAILABLE -#include + #ifdef __GNUC__ + #include + #include + #else + #include + #endif #elif defined(_MSC_VER) && defined(_M_IX86) -#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") + #pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") +#elif defined(__GNUC__) && defined(__i386__) + #pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.") #endif NAMESPACE_BEGIN(CryptoPP) @@ -41,7 +48,11 @@ CPP_TYPENAME AllocatorBase::pointer AlignedAllocator::allocate(size_type n { #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - return (T *)_mm_malloc(sizeof(T)*n, 16); + #ifdef __GNUC__ + return (T *)memalign(16, sizeof(T)*n); + #else + return (T *)_mm_malloc(sizeof(T)*n, 16); + #endif else #endif return new T[n]; @@ -53,10 +64,14 @@ void AlignedAllocator::deallocate(void *p, size_type n) memset(p, 0, n*sizeof(T)); #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - _mm_free(p); + #ifdef __GNUC__ + free(p); + #else + _mm_free(p); + #endif else #endif - delete [] p; + delete [] (T *)p; } #endif @@ -640,6 +655,13 @@ void Portable::Square2(word *R, const word *A) void Portable::Square4(word *R, const word *A) { +#ifdef _MSC_VER + // VC60 workaround: MSVC 6.0 has an optimization bug that makes + // (dword)A*B where either A or B has been cast to a dword before + // very expensive. Revisit this function when this + // bug is fixed. + Multiply4(R, A, A); +#else const word *B = A; DWord p, q; word c, d, e; @@ -666,6 +688,7 @@ void Portable::Square4(word *R, const word *A) p = DWord::MultiplyAndAdd(A[3], A[3], d); R[6] = p.GetLowHalf(); R[7] = e + p.GetHighHalf(); +#endif } void Portable::Multiply8(word *R, const word *A, const word *B) @@ -834,480 +857,763 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B) #undef SaveSquAcc // CodeWarrior defines _MSC_VER -#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700) +#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__)) class PentiumOptimized : public Portable { public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); -// TODO test this with .NET #if _MSC_VER < 1300 - static inline void Square4(word *R, const word *A) - { - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. 
- Multiply4(R, A, A); - } -//#endif + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef __GNUC__ + static void Square4(word *R, const word *A); + static void Multiply4(word *C, const word *A, const word *B); + static void Multiply8(word *C, const word *A, const word *B); +#endif }; typedef PentiumOptimized LowLevel; -__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +// this may be selected at run time +class P4Optimized : public PentiumOptimized { - __asm - { - push ebp - push ebx - push esi - push edi +public: + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef SSE2_INTRINSICS_AVAILABLE + static void Multiply4(word *C, const word *A, const word *B); + static void Multiply8(word *C, const word *A, const word *B); + static void Multiply8Bottom(word *C, const word *A, const word *B); + static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);} +#endif +}; + +// use some tricks to share assembly code between MSVC and GCC +#ifdef _MSC_VER + #define CRYPTOPP_NAKED __declspec(naked) + #define AS1(x) __asm x + #define AS2(x, y) __asm x, y + #define PentiumPrologue \ + __asm push ebp \ + __asm push ebx \ + __asm push esi \ + __asm push edi \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define PentiumEpilogue \ + __asm pop edi \ + __asm pop esi \ + __asm pop ebx \ + __asm pop ebp \ + __asm ret + #define P4Prologue \ + __asm sub esp, 16 \ + __asm mov [esp], edi \ + __asm mov [esp+4], esi \ + __asm mov [esp+8], ebx \ + __asm mov [esp+12], ebp \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define P4Epilogue \ + __asm mov edi, [esp] \ + __asm mov esi, [esp+4] \ + __asm mov ebx, [esp+8] \ + __asm mov ebp, [esp+12] \ + __asm add esp, 16 \ + __asm ret +#else + #define CRYPTOPP_NAKED + #define AS1(x) #x ";" + #define AS2(x, y) #x ", " #y ";" + #define PentiumPrologue \ + __asm__ \ + ( \ + ".att_syntax prefix;" \ + "mov %0, %%ecx;" \ + "mov %1, %%edx;" \ + "mov %2, %%ebx;" \ + "mov %3, %%esi;" \ + ".intel_syntax noprefix;" + #define PentiumEpilogue \ + ".att_syntax prefix;" \ + : \ + : "m" (C), "m" (A), "m" (B), "m" (N) \ + : "%ecx", "%edx", "%ebx", "%esi", "%edi" \ + ); + #define P4Prologue PentiumPrologue + #define P4Epilogue PentiumEpilogue +#endif - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B +CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +{ + PentiumPrologue - // now: ebx = B, ecx = C, edx = A, esi = N + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - sub ecx, edx // hold the distance between C & A so we can add this to A to get C - xor eax, eax // clear eax + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - sub eax, esi // eax is a negative index from end of B - lea ebx, [ebx+4*esi] // ebx is end of B + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendAdd) // if no dwords then nothing to do - sar eax, 1 // unit of eax is now dwords; this also 
clears the carry flag - jz loopend // if no dwords then nothing to do + AS1(loopstartAdd:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A -loopstart: - mov esi,[edx] // load lower word of A - mov ebp,[edx+4] // load higher word of A + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C - mov edi,[ebx+8*eax] // load lower word of B - lea edx,[edx+8] // advance A and C + AS2( adc esi,edi) // add lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - adc esi,edi // add lower words - mov edi,[ebx+8*eax+4] // load higher word of B + AS2( adc ebp,edi) // add higher words + AS1( inc eax) // advance B - adc ebp,edi // add higher words - inc eax // advance B + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result - mov [edx+ecx-8],esi // store lower word result - mov [edx+ecx-4],ebp // store higher word result + AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero - jnz loopstart // loop until eax overflows and becomes zero + AS1(loopendAdd:) + AS2( adc eax, 0) // store carry into eax (return result register) -loopend: - adc eax, 0 // store carry into eax (return result register) - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } + PentiumEpilogue } -__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) { - __asm - { - push ebp - push ebx - push esi - push edi + PentiumPrologue - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - sub ecx, edx - xor eax, eax + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - sub eax, esi - lea ebx, [ebx+4*esi] + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendSub) // if no dwords then nothing to do - sar eax, 1 - jz loopend + AS1(loopstartSub:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A -loopstart: - mov esi,[edx] - mov ebp,[edx+4] + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C - mov edi,[ebx+8*eax] - lea edx,[edx+8] + AS2( sbb esi,edi) // subtract lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - sbb esi,edi - mov edi,[ebx+8*eax+4] + AS2( sbb ebp,edi) // subtract higher words + AS1( inc eax) // advance B - sbb ebp,edi - inc eax + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result - mov [edx+ecx-8],esi - mov [edx+ecx-4],ebp + AS1( jnz loopstartSub) // loop until eax overflows and becomes zero - jnz loopstart + AS1(loopendSub:) + AS2( adc eax, 0) // store carry into eax (return result register) -loopend: - adc eax, 0 - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } + PentiumEpilogue } -#ifdef SSE2_INTRINSICS_AVAILABLE - -static bool GetSSE2Capability() +CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) { - word32 b; + P4Prologue - __asm - { - mov eax, 1 - cpuid - mov b, edx - } + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz loopendAddP4) // if no dwords then nothing to do - return (b & (1 << 26)) != 0; 
-} + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) -bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; + AS1(loopstartAddP4:) + AS2( add edi, eax) + AS1( jc carry1AddP4) -void DisableSSE2() -{ - g_sse2Enabled = false; -} + AS2( xor eax, eax) -static inline bool HasSSE2() -{ - if (g_sse2Enabled && !g_sse2DetectionDone) - { - g_sse2Detected = GetSSE2Capability(); - g_sse2DetectionDone = true; - } - return g_sse2Enabled && g_sse2Detected; -} + AS1(carry1continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( add edi, eax) + AS1( jc carry2AddP4) -class P4Optimized : public PentiumOptimized -{ -public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static inline void Square4(word *R, const word *A) - { - Multiply4(R, A, A); - } - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; + AS2( xor eax, eax) -static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) -{ - __m128i a3210 = _mm_load_si128(A); - __m128i b3210 = _mm_load_si128(B); + AS1(carry2continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartAddP4) + AS1( jmp loopendAddP4) - __m128i sum; + AS1(carry1AddP4:) + AS2( mov eax, 1) + AS1( jmp carry1continueAddP4) - __m128i z = _mm_setzero_si128(); - __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); - C[0] = a2b2_a0b0; + AS1(carry2AddP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueAddP4) - __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); - __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); - __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); - __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); - C[1] = _mm_add_epi64(a1b0, a0b1); + AS1(loopendAddP4:) - __m128i a31 = _mm_srli_epi64(a3210, 32); - __m128i b31 = _mm_srli_epi64(b3210, 32); - __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); - C[6] = a3b3_a1b1; + P4Epilogue +} - __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); - __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); - __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); - __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); - sum = _mm_add_epi64(a1b1, a0b2); - C[2] = _mm_add_epi64(sum, a2b0); +CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +{ + P4Prologue - __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); - __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); - __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); - __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); - __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); - __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); - __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); - __m128i sum1 = _mm_add_epi64(a3b0, a1b2); - sum = _mm_add_epi64(a2b1, a0b3); - C[3] = _mm_add_epi64(sum, sum1); + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz 
loopendSubP4) // if no dwords then nothing to do - __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); - __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); - __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); - __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); - sum = _mm_add_epi64(a2b2, a3b1); - C[4] = _mm_add_epi64(sum, a1b3); + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) - __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); - __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); - __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); - __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); - __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); - C[5] = _mm_add_epi64(a3b2, a2b3); -} + AS1(loopstartSubP4:) + AS2( sub edi, eax) + AS1( jc carry1SubP4) -void P4Optimized::Multiply4(word *C, const word *A, const word *B) -{ - __m128i temp[7]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; + AS2( xor eax, eax) - P4_Mul(temp, (__m128i *)A, (__m128i *)B); + AS1(carry1continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( sub edi, eax) + AS1( jc carry2SubP4) - C[0] = w[0]; + AS2( xor eax, eax) - __m64 s1, s2; + AS1(carry2continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartSubP4) + AS1( jmp loopendSubP4) - __m64 w1 = _m_from_int(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); + AS1(carry1SubP4:) + AS2( mov eax, 1) + AS1( jmp carry1continueSubP4) + + AS1(carry2SubP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueSubP4) - s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + AS1(loopendSubP4:) + + P4Epilogue +} + +#if __GNUC__ +// Comba square and multiply assembly code originally contributed by Leonard Janke +// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code. 
- s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define SqrStartup \ + "push %%ebp\n\t" \ + "push %%esi\n\t" \ + "push %%ebx\n\t" \ + "xor %%ebp, %%ebp\n\t" \ + "xor %%ebx, %%ebx\n\t" \ + "xor %%ecx, %%ecx\n\t" - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define SqrShiftCarry \ + "mov %%ebx, %%ebp\n\t" \ + "mov %%ecx, %%ebx\n\t" \ + "xor %%ecx, %%ecx\n\t" - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define SqrAccumulate(i,j) \ + "mov 4*"#j"(%%esi), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" - s2 = _mm_add_si64(w18, w20); - s1 = _mm_add_si64(s1, s2); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define SqrAccumulateCentre(i) \ + "mov 4*"#i"(%%esi), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "adc %%ch, %%cl\n\t" - s2 = _mm_add_si64(w22, w26); - s1 = _mm_add_si64(s1, s2); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define SqrStoreDigit(X) \ + "mov %%ebp, 4*"#X"(%%edi)\n\t" \ - C[7] = _m_to_int(s1) + w[27]; - _mm_empty(); -} +#define SqrLastDiagonal(digits) \ + "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \ + "mull 4*("#digits"-1)(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%ebx\n\t" \ + "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ + "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" -void P4Optimized::Multiply8(word *C, const word *A, const word *B) +#define SqrCleanup \ + "pop %%ebx\n\t" \ + "pop %%esi\n\t" \ + "pop %%ebp\n\t" + +void PentiumOptimized::Square4(word* Y, const word* X) { - __m128i temp[28]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - const word *z = (word *)temp+7*4*3; - const __m64 *mz = (__m64 *)z; + __asm__ __volatile__( + SqrStartup - P4_Mul(temp, (__m128i *)A, (__m128i *)B); + SqrAccumulateCentre(0) + SqrStoreDigit(0) + SqrShiftCarry - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); + SqrAccumulate(1,0) + SqrStoreDigit(1) + SqrShiftCarry - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + SqrAccumulate(2,0) + SqrAccumulateCentre(1) + SqrStoreDigit(2) + SqrShiftCarry - P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); + SqrAccumulate(3,0) + SqrAccumulate(2,1) + SqrStoreDigit(3) + SqrShiftCarry - C[0] = w[0]; + SqrAccumulate(3,1) + SqrAccumulateCentre(2) + SqrStoreDigit(4) + SqrShiftCarry - __m64 s1, s2, s3, s4; + SqrAccumulate(3,2) + SqrStoreDigit(5) + SqrShiftCarry - __m64 w1 = _m_from_int(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); - __m64 w27 = _m_from_int(w[27]); + SqrLastDiagonal(4) - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - __m64 x10 = mx[5]; - __m64 x12 = mx[6]; - __m64 x14 = mx[7]; - __m64 x16 = mx[8]; - __m64 x18 = mx[9]; - __m64 x20 = mx[10]; - __m64 x22 = mx[11]; - __m64 x26 = _m_from_int(x[26]); - __m64 x27 = _m_from_int(x[27]); + SqrCleanup - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); - __m64 y4 = 
my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - __m64 y10 = my[5]; - __m64 y12 = my[6]; - __m64 y14 = my[7]; - __m64 y16 = my[8]; - __m64 y18 = my[9]; - __m64 y20 = my[10]; - __m64 y22 = my[11]; - __m64 y26 = _m_from_int(y[26]); - __m64 y27 = _m_from_int(y[27]); + : + : "D" (Y), "S" (X) + : "eax", "ecx", "edx", "ebp", "memory" + ); +} - __m64 z0 = _m_from_int(z[0]); - __m64 z1 = _m_from_int(z[1]); - __m64 z4 = mz[2]; - __m64 z6 = mz[3]; - __m64 z8 = mz[4]; - __m64 z10 = mz[5]; - __m64 z12 = mz[6]; - __m64 z14 = mz[7]; - __m64 z16 = mz[8]; - __m64 z18 = mz[9]; - __m64 z20 = mz[10]; - __m64 z22 = mz[11]; - __m64 z26 = _m_from_int(z[26]); +#define MulStartup \ + "push %%ebp\n\t" \ + "push %%esi\n\t" \ + "push %%ebx\n\t" \ + "push %%edi\n\t" \ + "mov %%eax, %%ebx \n\t" \ + "xor %%ebp, %%ebp\n\t" \ + "xor %%edi, %%edi\n\t" \ + "xor %%ecx, %%ecx\n\t" - s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define MulShiftCarry \ + "mov %%edx, %%ebp\n\t" \ + "mov %%ecx, %%edi\n\t" \ + "xor %%ecx, %%ecx\n\t" - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define MulAccumulate(i,j) \ + "mov 4*"#j"(%%ebx), %%eax\n\t" \ + "mull 4*"#i"(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edx, %%edi\n\t" \ + "adc %%ch, %%cl\n\t" - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define MulStoreDigit(X) \ + "mov %%edi, %%edx \n\t" \ + "mov (%%esp), %%edi \n\t" \ + "mov %%ebp, 4*"#X"(%%edi)\n\t" \ + "mov %%edi, (%%esp)\n\t" - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define MulLastDiagonal(digits) \ + "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \ + "mull 4*("#digits"-1)(%%esi)\n\t" \ + "add %%eax, %%ebp\n\t" \ + "adc %%edi, %%edx\n\t" \ + "mov (%%esp), %%edi\n\t" \ + "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ + "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +#define MulCleanup \ + "pop %%edi\n\t" \ + "pop %%ebx\n\t" \ + "pop %%esi\n\t" \ + "pop %%ebp\n\t" - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); +void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) +{ + __asm__ __volatile__( + MulStartup + MulAccumulate(0,0) + MulStoreDigit(0) + MulShiftCarry - s3 = _mm_add_si64(x10, y10); - s4 = _mm_add_si64(x12, y12); - s1 = _mm_add_si64(s1, w27); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[7] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + MulAccumulate(1,0) + MulAccumulate(0,1) + MulStoreDigit(1) + MulShiftCarry - s3 = _mm_add_si64(x14, y14); - s4 = _mm_add_si64(x16, y16); - s1 = _mm_add_si64(s1, z0); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[8] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + MulAccumulate(2,0) + MulAccumulate(1,1) + MulAccumulate(0,2) + MulStoreDigit(2) + MulShiftCarry - s3 = _mm_add_si64(x18, y18); - s4 = _mm_add_si64(x20, y20); - s1 = _mm_add_si64(s1, z1); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z4); - s1 = _mm_add_si64(s1, s3); - C[9] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 
32); + MulAccumulate(3,0) + MulAccumulate(2,1) + MulAccumulate(1,2) + MulAccumulate(0,3) + MulStoreDigit(3) + MulShiftCarry + + MulAccumulate(3,1) + MulAccumulate(2,2) + MulAccumulate(1,3) + MulStoreDigit(4) + MulShiftCarry + + MulAccumulate(3,2) + MulAccumulate(2,3) + MulStoreDigit(5) + MulShiftCarry + + MulLastDiagonal(4) + + MulCleanup + + : + : "D" (Z), "S" (X), "a" (Y) + : "%ecx", "%edx", "memory" + ); +} + +void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) +{ + __asm__ __volatile__( + MulStartup + MulAccumulate(0,0) + MulStoreDigit(0) + MulShiftCarry + + MulAccumulate(1,0) + MulAccumulate(0,1) + MulStoreDigit(1) + MulShiftCarry + + MulAccumulate(2,0) + MulAccumulate(1,1) + MulAccumulate(0,2) + MulStoreDigit(2) + MulShiftCarry + + MulAccumulate(3,0) + MulAccumulate(2,1) + MulAccumulate(1,2) + MulAccumulate(0,3) + MulStoreDigit(3) + MulShiftCarry + + MulAccumulate(4,0) + MulAccumulate(3,1) + MulAccumulate(2,2) + MulAccumulate(1,3) + MulAccumulate(0,4) + MulStoreDigit(4) + MulShiftCarry + + MulAccumulate(5,0) + MulAccumulate(4,1) + MulAccumulate(3,2) + MulAccumulate(2,3) + MulAccumulate(1,4) + MulAccumulate(0,5) + MulStoreDigit(5) + MulShiftCarry + + MulAccumulate(6,0) + MulAccumulate(5,1) + MulAccumulate(4,2) + MulAccumulate(3,3) + MulAccumulate(2,4) + MulAccumulate(1,5) + MulAccumulate(0,6) + MulStoreDigit(6) + MulShiftCarry + + MulAccumulate(7,0) + MulAccumulate(6,1) + MulAccumulate(5,2) + MulAccumulate(4,3) + MulAccumulate(3,4) + MulAccumulate(2,5) + MulAccumulate(1,6) + MulAccumulate(0,7) + MulStoreDigit(7) + MulShiftCarry + + MulAccumulate(7,1) + MulAccumulate(6,2) + MulAccumulate(5,3) + MulAccumulate(4,4) + MulAccumulate(3,5) + MulAccumulate(2,6) + MulAccumulate(1,7) + MulStoreDigit(8) + MulShiftCarry + + MulAccumulate(7,2) + MulAccumulate(6,3) + MulAccumulate(5,4) + MulAccumulate(4,5) + MulAccumulate(3,6) + MulAccumulate(2,7) + MulStoreDigit(9) + MulShiftCarry + + MulAccumulate(7,3) + MulAccumulate(6,4) + MulAccumulate(5,5) + MulAccumulate(4,6) + MulAccumulate(3,7) + MulStoreDigit(10) + MulShiftCarry + + MulAccumulate(7,4) + MulAccumulate(6,5) + MulAccumulate(5,6) + MulAccumulate(4,7) + MulStoreDigit(11) + MulShiftCarry + + MulAccumulate(7,5) + MulAccumulate(6,6) + MulAccumulate(5,7) + MulStoreDigit(12) + MulShiftCarry + + MulAccumulate(7,6) + MulAccumulate(6,7) + MulStoreDigit(13) + MulShiftCarry + + MulLastDiagonal(8) + + MulCleanup + + : + : "D" (Z), "S" (X), "a" (Y) + : "%ecx", "%edx", "memory" + ); +} + +#endif // __GNUC__ + +#else // not x86 - no processor specific code at this layer + +typedef Portable LowLevel; + +#endif + +bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; + +void DisableSSE2() +{ + g_sse2Enabled = false; +} + +#ifdef SSE2_INTRINSICS_AVAILABLE + +static bool GetSSE2Capability() +{ + word32 b; + +#ifdef __GNUC__ + __asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx"); +#else + __asm + { + mov eax, 1 + cpuid + mov b, edx + } +#endif + + return (b & (1 << 26)) != 0; +} + +static inline bool HasSSE2() +{ + if (g_sse2Enabled && !g_sse2DetectionDone) + { + g_sse2Detected = GetSSE2Capability(); + g_sse2DetectionDone = true; + } + return g_sse2Enabled && g_sse2Detected; +} + +#ifdef __GNUC__ +#define __fastcall +#endif + +static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) +{ + __m128i a3210 = _mm_load_si128(A); + __m128i b3210 = _mm_load_si128(B); + + __m128i sum; + + __m128i z = _mm_setzero_si128(); + __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); + C[0] = 
a2b2_a0b0; + + __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); + __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); + __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); + __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); + C[1] = _mm_add_epi64(a1b0, a0b1); + + __m128i a31 = _mm_srli_epi64(a3210, 32); + __m128i b31 = _mm_srli_epi64(b3210, 32); + __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); + C[6] = a3b3_a1b1; + + __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); + __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); + __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); + __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); + __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); + sum = _mm_add_epi64(a1b1, a0b2); + C[2] = _mm_add_epi64(sum, a2b0); + + __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); + __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); + __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); + __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); + __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); + __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); + __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); + __m128i sum1 = _mm_add_epi64(a3b0, a1b2); + sum = _mm_add_epi64(a2b1, a0b3); + C[3] = _mm_add_epi64(sum, sum1); + + __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); + __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); + __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); + __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); + sum = _mm_add_epi64(a2b2, a3b1); + C[4] = _mm_add_epi64(sum, a1b3); + + __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); + __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); + __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); + __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); + __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); + C[5] = _mm_add_epi64(a3b2, a2b3); +} + +void P4Optimized::Multiply4(word *C, const word *A, const word *B) +{ + __m128i temp[7]; + const word *w = (word *)temp; + const __m64 *mw = (__m64 *)w; + + P4_Mul(temp, (__m128i *)A, (__m128i *)B); + + C[0] = w[0]; + + __m64 s1, s2; + + __m64 w1 = _mm_cvtsi32_si64(w[1]); + __m64 w4 = mw[2]; + __m64 w6 = mw[3]; + __m64 w8 = mw[4]; + __m64 w10 = mw[5]; + __m64 w12 = mw[6]; + __m64 w14 = mw[7]; + __m64 w16 = mw[8]; + __m64 w18 = mw[9]; + __m64 w20 = mw[10]; + __m64 w22 = mw[11]; + __m64 w26 = _mm_cvtsi32_si64(w[26]); - s3 = _mm_add_si64(x22, y22); - s4 = _mm_add_si64(x26, y26); - s1 = _mm_add_si64(s1, z6); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z8); - s1 = _mm_add_si64(s1, s3); - C[10] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + s1 = _mm_add_si64(w1, w4); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - s3 = _mm_add_si64(x27, y27); - s1 = _mm_add_si64(s1, z10); - s1 = _mm_add_si64(s1, z12); - s1 = _mm_add_si64(s1, s3); - C[11] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + s2 = _mm_add_si64(w6, w8); + s1 = _mm_add_si64(s1, s2); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - s3 = _mm_add_si64(z14, z16); - s1 = _mm_add_si64(s1, s3); - C[12] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + s2 = _mm_add_si64(w10, w12); + s1 = _mm_add_si64(s1, s2); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - s3 = _mm_add_si64(z18, z20); - s1 = _mm_add_si64(s1, s3); - C[13] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s2); + C[4] = 
_mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - s3 = _mm_add_si64(z22, z26); - s1 = _mm_add_si64(s1, s3); - C[14] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + s2 = _mm_add_si64(w18, w20); + s1 = _mm_add_si64(s1, s2); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w22, w26); + s1 = _mm_add_si64(s1, s2); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - C[15] = z[27] + _m_to_int(s1); + C[7] = _mm_cvtsi64_si32(s1) + w[27]; _mm_empty(); } -void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) +void P4Optimized::Multiply8(word *C, const word *A, const word *B) { - __m128i temp[21]; + __m128i temp[28]; const word *w = (word *)temp; const __m64 *mw = (__m64 *)w; const word *x = (word *)temp+7*4; const __m64 *mx = (__m64 *)x; const word *y = (word *)temp+7*4*2; const __m64 *my = (__m64 *)y; + const word *z = (word *)temp+7*4*3; + const __m64 *mz = (__m64 *)z; P4_Mul(temp, (__m128i *)A, (__m128i *)B); @@ -1315,11 +1621,13 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); + C[0] = w[0]; __m64 s1, s2, s3, s4; - __m64 w1 = _m_from_int(w[1]); + __m64 w1 = _mm_cvtsi32_si64(w[1]); __m64 w4 = mw[2]; __m64 w6 = mw[3]; __m64 w8 = mw[4]; @@ -1330,607 +1638,241 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) __m64 w18 = mw[9]; __m64 w20 = mw[10]; __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); + __m64 w26 = _mm_cvtsi32_si64(w[26]); + __m64 w27 = _mm_cvtsi32_si64(w[27]); - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); __m64 x4 = mx[2]; __m64 x6 = mx[3]; __m64 x8 = mx[4]; + __m64 x10 = mx[5]; + __m64 x12 = mx[6]; + __m64 x14 = mx[7]; + __m64 x16 = mx[8]; + __m64 x18 = mx[9]; + __m64 x20 = mx[10]; + __m64 x22 = mx[11]; + __m64 x26 = _mm_cvtsi32_si64(x[26]); + __m64 x27 = _mm_cvtsi32_si64(x[27]); - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); __m64 y4 = my[2]; __m64 y6 = my[3]; __m64 y8 = my[4]; + __m64 y10 = my[5]; + __m64 y12 = my[6]; + __m64 y14 = my[7]; + __m64 y16 = my[8]; + __m64 y18 = my[9]; + __m64 y20 = my[10]; + __m64 y22 = my[11]; + __m64 y26 = _mm_cvtsi32_si64(y[26]); + __m64 y27 = _mm_cvtsi32_si64(y[27]); + + __m64 z0 = _mm_cvtsi32_si64(z[0]); + __m64 z1 = _mm_cvtsi32_si64(z[1]); + __m64 z4 = mz[2]; + __m64 z6 = mz[3]; + __m64 z8 = mz[4]; + __m64 z10 = mz[5]; + __m64 z12 = mz[6]; + __m64 z14 = mz[7]; + __m64 z16 = mz[8]; + __m64 z18 = mz[9]; + __m64 z20 = mz[10]; + __m64 z22 = mz[11]; + __m64 z26 = _mm_cvtsi32_si64(z[26]); s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w6, w8); s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); s2 = _mm_add_si64(w10, w12); s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = 
_m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; - _mm_empty(); -} - -__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] // N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - add edi, eax - jc carry1 - - xor eax, eax - -carry1continue: - add edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - add edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - add edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - -__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] // N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - sub edi, eax - jc carry1 - - xor eax, eax - -carry1continue: - sub edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - sub edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - sub edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - -#endif // #ifdef SSE2_INTRINSICS_AVAILABLE - -#elif defined(__GNUC__) && defined(__i386__) - -class PentiumOptimized : public Portable -{ -public: -#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code - static word Add(word *C, const word *A, const word *B, unsigned int N); - static word Subtract(word *C, const word *A, const word *B, unsigned int N); -#endif - static void Square4(word *R, const word *A); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); -}; - -typedef PentiumOptimized LowLevel; - -// Add and Subtract assembly code originally contributed by Alister Lee - -#ifndef __pic__ -__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - assert (N%2 == 0); - - register word carry, temp; - - __asm__ __volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" - - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "adc %5, %4;" - "mov 
4(%1,%0,8), %5;" - "adc %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" - - "1:;" - "adc $0, %0;" - "pop %%ebp;" - - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); - - return carry; -} - -__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - assert (N%2 == 0); - - register word carry, temp; - - __asm__ __volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" - - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "sbb %5, %4;" - "mov 4(%1,%0,8), %5;" - "sbb %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" - - "1:;" - "adc $0, %0;" - "pop %%ebp;" - - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); - - return carry; -} -#endif // __pic__ - -// Comba square and multiply assembly code originally contributed by Leonard Janke - -#define SqrStartup \ - "push %%ebp\n\t" \ - "push %%esi\n\t" \ - "push %%ebx\n\t" \ - "xor %%ebp, %%ebp\n\t" \ - "xor %%ebx, %%ebx\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define SqrShiftCarry \ - "mov %%ebx, %%ebp\n\t" \ - "mov %%ecx, %%ebx\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define SqrAccumulate(i,j) \ - "mov 4*"#j"(%%esi), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" - -#define SqrAccumulateCentre(i) \ - "mov 4*"#i"(%%esi), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "adc %%ch, %%cl\n\t" - -#define SqrStoreDigit(X) \ - "mov %%ebp, 4*"#X"(%%edi)\n\t" \ - -#define SqrLastDiagonal(digits) \ - "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \ - "mull 4*("#digits"-1)(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%ebx\n\t" \ - "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ - "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" - -#define SqrCleanup \ - "pop %%ebx\n\t" \ - "pop %%esi\n\t" \ - "pop %%ebp\n\t" - -void PentiumOptimized::Square4(word* Y, const word* X) -{ - __asm__ __volatile__( - SqrStartup + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - SqrAccumulateCentre(0) - SqrStoreDigit(0) - SqrShiftCarry - - SqrAccumulate(1,0) - SqrStoreDigit(1) - SqrShiftCarry - - SqrAccumulate(2,0) - SqrAccumulateCentre(1) - SqrStoreDigit(2) - SqrShiftCarry - - SqrAccumulate(3,0) - SqrAccumulate(2,1) - SqrStoreDigit(3) - SqrShiftCarry - - SqrAccumulate(3,1) - SqrAccumulateCentre(2) - SqrStoreDigit(4) - SqrShiftCarry - - SqrAccumulate(3,2) - SqrStoreDigit(5) - SqrShiftCarry - - SqrLastDiagonal(4) - - SqrCleanup - - : - : "D" (Y), "S" (X) - : "eax", "ecx", "edx", "ebp", "memory" - ); -} - -#define MulStartup \ - "push %%ebp\n\t" \ - "push %%esi\n\t" \ - "push %%ebx\n\t" \ - "push %%edi\n\t" \ - "mov %%eax, %%ebx \n\t" \ - "xor %%ebp, %%ebp\n\t" \ - "xor %%edi, %%edi\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define MulShiftCarry \ - "mov %%edx, %%ebp\n\t" \ - "mov %%ecx, %%edi\n\t" \ - "xor %%ecx, %%ecx\n\t" - -#define MulAccumulate(i,j) \ - "mov 4*"#j"(%%ebx), %%eax\n\t" \ - "mull 4*"#i"(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edx, %%edi\n\t" \ - "adc %%ch, %%cl\n\t" - -#define MulStoreDigit(X) \ - "mov %%edi, %%edx \n\t" \ - "mov (%%esp), %%edi \n\t" \ - "mov %%ebp, 4*"#X"(%%edi)\n\t" \ - "mov %%edi, (%%esp)\n\t" + s3 = _mm_add_si64(x0, y0); + 
s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s3); + s1 = _mm_add_si64(s1, s2); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); -#define MulLastDiagonal(digits) \ - "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \ - "mull 4*("#digits"-1)(%%esi)\n\t" \ - "add %%eax, %%ebp\n\t" \ - "adc %%edi, %%edx\n\t" \ - "mov (%%esp), %%edi\n\t" \ - "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ - "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" + s3 = _mm_add_si64(x1, y1); + s4 = _mm_add_si64(x4, y4); + s1 = _mm_add_si64(s1, w18); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w20); + s1 = _mm_add_si64(s1, s3); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); -#define MulCleanup \ - "pop %%edi\n\t" \ - "pop %%ebx\n\t" \ - "pop %%esi\n\t" \ - "pop %%ebp\n\t" + s3 = _mm_add_si64(x6, y6); + s4 = _mm_add_si64(x8, y8); + s1 = _mm_add_si64(s1, w22); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w26); + s1 = _mm_add_si64(s1, s3); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); -void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) -{ - __asm__ __volatile__( - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry + s3 = _mm_add_si64(x10, y10); + s4 = _mm_add_si64(x12, y12); + s1 = _mm_add_si64(s1, w27); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, s3); + C[7] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry + s3 = _mm_add_si64(x14, y14); + s4 = _mm_add_si64(x16, y16); + s1 = _mm_add_si64(s1, z0); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, s3); + C[8] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry + s3 = _mm_add_si64(x18, y18); + s4 = _mm_add_si64(x20, y20); + s1 = _mm_add_si64(s1, z1); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, z4); + s1 = _mm_add_si64(s1, s3); + C[9] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry + s3 = _mm_add_si64(x22, y22); + s4 = _mm_add_si64(x26, y26); + s1 = _mm_add_si64(s1, z6); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, z8); + s1 = _mm_add_si64(s1, s3); + C[10] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulStoreDigit(4) - MulShiftCarry + s3 = _mm_add_si64(x27, y27); + s1 = _mm_add_si64(s1, z10); + s1 = _mm_add_si64(s1, z12); + s1 = _mm_add_si64(s1, s3); + C[11] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(3,2) - MulAccumulate(2,3) - MulStoreDigit(5) - MulShiftCarry + s3 = _mm_add_si64(z14, z16); + s1 = _mm_add_si64(s1, s3); + C[12] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulLastDiagonal(4) + s3 = _mm_add_si64(z18, z20); + s1 = _mm_add_si64(s1, s3); + C[13] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulCleanup + s3 = _mm_add_si64(z22, z26); + s1 = _mm_add_si64(s1, s3); + C[14] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - : - : "D" (Z), "S" (X), "a" (Y) - : "%ecx", "%edx", "memory" - ); + C[15] = z[27] + _mm_cvtsi64_si32(s1); + _mm_empty(); } -void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) +void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) { - __asm__ __volatile__( - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - 
MulStoreDigit(1) - MulShiftCarry + __m128i temp[21]; + const word *w = (word *)temp; + const __m64 *mw = (__m64 *)w; + const word *x = (word *)temp+7*4; + const __m64 *mx = (__m64 *)x; + const word *y = (word *)temp+7*4*2; + const __m64 *my = (__m64 *)y; - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry + P4_Mul(temp, (__m128i *)A, (__m128i *)B); - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry + P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry + P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry + C[0] = w[0]; - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry + __m64 s1, s2, s3, s4; - MulAccumulate(7,0) - MulAccumulate(6,1) - MulAccumulate(5,2) - MulAccumulate(4,3) - MulAccumulate(3,4) - MulAccumulate(2,5) - MulAccumulate(1,6) - MulAccumulate(0,7) - MulStoreDigit(7) - MulShiftCarry + __m64 w1 = _mm_cvtsi32_si64(w[1]); + __m64 w4 = mw[2]; + __m64 w6 = mw[3]; + __m64 w8 = mw[4]; + __m64 w10 = mw[5]; + __m64 w12 = mw[6]; + __m64 w14 = mw[7]; + __m64 w16 = mw[8]; + __m64 w18 = mw[9]; + __m64 w20 = mw[10]; + __m64 w22 = mw[11]; + __m64 w26 = _mm_cvtsi32_si64(w[26]); - MulAccumulate(7,1) - MulAccumulate(6,2) - MulAccumulate(5,3) - MulAccumulate(4,4) - MulAccumulate(3,5) - MulAccumulate(2,6) - MulAccumulate(1,7) - MulStoreDigit(8) - MulShiftCarry + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); + __m64 x4 = mx[2]; + __m64 x6 = mx[3]; + __m64 x8 = mx[4]; - MulAccumulate(7,2) - MulAccumulate(6,3) - MulAccumulate(5,4) - MulAccumulate(4,5) - MulAccumulate(3,6) - MulAccumulate(2,7) - MulStoreDigit(9) - MulShiftCarry + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); + __m64 y4 = my[2]; + __m64 y6 = my[3]; + __m64 y8 = my[4]; - MulAccumulate(7,3) - MulAccumulate(6,4) - MulAccumulate(5,5) - MulAccumulate(4,6) - MulAccumulate(3,7) - MulStoreDigit(10) - MulShiftCarry + s1 = _mm_add_si64(w1, w4); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(7,4) - MulAccumulate(6,5) - MulAccumulate(5,6) - MulAccumulate(4,7) - MulStoreDigit(11) - MulShiftCarry + s2 = _mm_add_si64(w6, w8); + s1 = _mm_add_si64(s1, s2); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(7,5) - MulAccumulate(6,6) - MulAccumulate(5,7) - MulStoreDigit(12) - MulShiftCarry + s2 = _mm_add_si64(w10, w12); + s1 = _mm_add_si64(s1, s2); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulAccumulate(7,6) - MulAccumulate(6,7) - MulStoreDigit(13) - MulShiftCarry + s3 = _mm_add_si64(x0, y0); + s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s3); + s1 = _mm_add_si64(s1, s2); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulLastDiagonal(8) + s3 = _mm_add_si64(x1, y1); + s4 = _mm_add_si64(x4, y4); + s1 = _mm_add_si64(s1, w18); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w20); + s1 = _mm_add_si64(s1, s3); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - MulCleanup + s3 = _mm_add_si64(x6, y6); + s4 = _mm_add_si64(x8, y8); + s1 = _mm_add_si64(s1, w22); + s3 = 
_mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w26); + s1 = _mm_add_si64(s1, s3); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); - : - : "D" (Z), "S" (X), "a" (Y) - : "%ecx", "%edx", "memory" - ); + C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; + _mm_empty(); } -#else // no processor specific code at this layer - -typedef Portable LowLevel; - -#endif +#endif // #ifdef SSE2_INTRINSICS_AVAILABLE // ******************************************************** -- cgit v1.2.1
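
Notes on the techniques in this patch. The sketches below are illustrative only; none of this code is part of the commit, and helper names such as AllocWordBuffer are hypothetical.

The allocator hunk swaps _mm_malloc/_mm_free for memalign/free when building with GCC: the SSE2 path loads operands with _mm_load_si128, so any buffer the vector code touches (the n >= 4 case) must be 16-byte aligned, while small blocks keep using new[]/delete[]. A minimal sketch of the GCC side, assuming glibc's <malloc.h>:

    #include <malloc.h>    // memalign (glibc)
    #include <cstdlib>     // free, std::size_t

    typedef unsigned int word;              // 32-bit word, as integer.cpp uses on x86

    word* AllocWordBuffer(std::size_t n)    // hypothetical helper
    {
        // same shape as AlignedAllocator<T>::allocate for n >= 4 under __GNUC__
        return (word*)memalign(16, n * sizeof(word));
    }

    void FreeWordBuffer(word* p)
    {
        free(p);                            // came from memalign, so free(), never delete[]
    }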
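
The largest structural change is the CRYPTOPP_NAKED / AS1 / AS2 / PentiumPrologue family of macros, which lets one Intel-syntax assembly body compile either as MSVC __asm statements or, under GCC, as a single stringized extended-asm template bracketed by .intel_syntax/.att_syntax directives. A minimal, self-contained x86 reduction of the trick; Increment is a hypothetical demo, not a routine from the patch:

    #ifdef _MSC_VER
        #define AS1(x)    __asm x
        #define AS2(x, y) __asm x, y
    #else
        #define AS1(x)    #x ";"
        #define AS2(x, y) #x ", " #y ";"
    #endif

    unsigned int Increment(unsigned int n)
    {
    #ifdef __GNUC__
        // AS1(inc eax) expands to "inc eax;", and adjacent string literals are
        // concatenated into one asm template, just as in PentiumOptimized::Add.
        __asm__(
            ".intel_syntax noprefix;"
            AS1( inc eax)
            ".att_syntax prefix;"
            : "+a" (n) : : "cc");
    #else
        // MSVC (x86 only): the same macros expand to ordinary inline asm.
        AS2( mov eax, n)
        AS1( inc eax)
        AS2( mov n, eax)
    #endif
        return n;
    }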
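
The new GCC-only Square4/Multiply4/Multiply8 routines (the Comba code credited to Leonard Janke) perform column-wise schoolbook multiplication: each MulAccumulate(i,j) adds one 32x32 -> 64 partial product into a three-word accumulator held in registers, MulStoreDigit writes the finished column, and MulShiftCarry slides the accumulator down one word. The same computation in portable C++, shown only to make the register choreography readable (Comba4x4 is hypothetical, not a Crypto++ function):

    #include <cstdint>

    // R[0..7] = A[0..3] * B[0..3], 32-bit words, least significant first
    void Comba4x4(uint32_t R[8], const uint32_t A[4], const uint32_t B[4])
    {
        uint32_t lo = 0, hi = 0, carry = 0;              // three-word accumulator
        for (int k = 0; k < 7; ++k)                      // one pass per output column
        {
            for (int i = 0; i < 4; ++i)
            {
                int j = k - i;
                if (j < 0 || j > 3)
                    continue;
                uint64_t t = (uint64_t)A[i] * B[j];      // MulAccumulate(i,j): 32x32 -> 64
                uint64_t s = (uint64_t)lo + (uint32_t)t;                // add low half
                lo = (uint32_t)s;
                s  = (uint64_t)hi + (uint32_t)(t >> 32) + (s >> 32);    // high half + carry
                hi = (uint32_t)s;
                carry += (uint32_t)(s >> 32);            // overflow out of hi (the adc into cl)
            }
            R[k] = lo;                                   // MulStoreDigit(k)
            lo = hi; hi = carry; carry = 0;              // MulShiftCarry
        }
        R[7] = lo;                                       // top word, cf. MulLastDiagonal(4)
    }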
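
Run-time dispatch between the Pentium and P4 code paths rests on GetSSE2Capability(), which executes CPUID leaf 1 and tests bit 26 of EDX, with the result cached by HasSSE2(). The patch open-codes the CPUID in inline assembly because it targets GCC 3.3; on current GCC or Clang the same check could be written with <cpuid.h>, roughly as below (a sketch, not the patch's code):

    #include <cpuid.h>    // __get_cpuid, bit_SSE2 (GCC/Clang)

    static bool HasSSE2Sketch()
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))     // CPUID leaf 1: feature flags
            return false;
        return (edx & bit_SSE2) != 0;                    // EDX bit 26, the same test as the patch
    }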
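
Finally, P4_Mul() and the P4Optimized multiply routines are built on _mm_mul_epu32, which multiplies the even 32-bit lanes of two __m128i values into two 64-bit products per instruction; the shuffles before each call just line up the right operand pairs in those even lanes, and the 16-byte alignment from the allocator change is what makes the _mm_load_si128 loads legal. The building block in isolation (TwoProducts is a hypothetical demo; it uses unaligned loads so it runs on any buffer):

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdint>

    void TwoProducts(uint64_t out[2], const uint32_t a[4], const uint32_t b[4])
    {
        __m128i va = _mm_loadu_si128((const __m128i*)a);   // lanes a3 a2 a1 a0
        __m128i vb = _mm_loadu_si128((const __m128i*)b);
        __m128i p  = _mm_mul_epu32(va, vb);                // 64-bit lanes { a2*b2, a0*b0 }
        _mm_storeu_si128((__m128i*)out, p);                // out[0] = a0*b0, out[1] = a2*b2
    }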