From df1ffe1e41f89222c379d982e543c2a32da78cbd Mon Sep 17 00:00:00 2001
From: weidai
Date: Fri, 4 May 2007 15:24:09 +0000
Subject: fix compile for x64, DLL and VC 6

git-svn-id: svn://svn.code.sf.net/p/cryptopp/code/trunk/c5@332 57ff6487-cd31-0410-9ec3-f628ee90f5f0
---
 rijndael.cpp | 254 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 158 insertions(+), 96 deletions(-)

(limited to 'rijndael.cpp')

diff --git a/rijndael.cpp b/rijndael.cpp
index 4a8572f..ac4f769 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
-#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
 	if (HasMMX())
 	{
 		const word32 *k = m_key;
 		const word32 *kLoopEnd = k + m_rounds*4;
+	#if CRYPTOPP_BOOL_X64
+		#define K_REG			r8
+		#define K_END_REG		r9
+		#define SAVE_K
+		#define RESTORE_K
+		#define RESTORE_K_END
+		#define SAVE_0(x)		AS2(mov r10d, x)
+		#define SAVE_1(x)		AS2(mov r11d, x)
+		#define SAVE_2(x)		AS2(mov r12d, x)
+		#define RESTORE_0(x)	AS2(mov x, r10d)
+		#define RESTORE_1(x)	AS2(mov x, r11d)
+		#define RESTORE_2(x)	AS2(mov x, r12d)
+	#else
+		#define K_REG			esi
+		#define K_END_REG		edi
+		#define SAVE_K			AS2(movd mm4, esi)
+		#define RESTORE_K		AS2(movd esi, mm4)
+		#define RESTORE_K_END	AS2(movd edi, mm5)
+		#define SAVE_0(x)		AS2(movd mm0, x)
+		#define SAVE_1(x)		AS2(movd mm1, x)
+		#define SAVE_2(x)		AS2(movd mm2, x)
+		#define RESTORE_0(x)	AS2(movd x, mm0)
+		#define RESTORE_1(x)	AS2(movd x, mm1)
+		#define RESTORE_2(x)	AS2(movd x, mm2)
+	#endif
 #ifdef __GNUC__
 		word32 t0, t1, t2, t3;
 		__asm__ __volatile__
 		(
 		".intel_syntax noprefix;"
-	AS1(	push ebx)
-	AS1(	push ebp)
-	AS2(	mov ebp, eax)
+	AS_PUSH(	bx)
+	AS_PUSH(	bp)
+	AS2(	mov WORD_REG(bp), WORD_REG(ax))
+	#if CRYPTOPP_BOOL_X64
+	// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
+	AS1(	pushq K_REG)
+	AS1(	pushq K_END_REG)
+	AS1(	pushq r10)
+	AS1(	pushq r11)
+	AS1(	pushq r12)
+	AS2(	mov K_REG, rsi)
+	AS2(	mov K_END_REG, rcx)
+	#else
 	AS2(	movd mm5, ecx)
+	#endif
 #else
+	#if _MSC_VER < 1300
+		const word32 *t = Te;
+		AS2(	mov eax, t)
+	#endif
 	AS2(	mov edx, g_cacheLineSize)
-	AS2(	mov edi, inBlock)
-	AS2(	mov esi, k)
+	AS2(	mov WORD_REG(di), inBlock)
+	AS2(	mov K_REG, k)
 	AS2(	movd mm5, kLoopEnd)
-	AS1(	push ebp)
+	#if _MSC_VER < 1300
+	AS_PUSH(	bx)
+	AS_PUSH(	bp)
+	AS2(	mov ebp, eax)
+	#else
+	AS_PUSH(	bp)
 	AS2(	lea ebp, Te)
+	#endif
 #endif
-	AS2(	mov eax, [esi+0*4])		// s0
-	AS2(	xor eax, [edi+0*4])
-	AS2(	movd mm0, eax)
-	AS2(	mov ebx, [esi+1*4])
-	AS2(	xor ebx, [edi+1*4])
-	AS2(	movd mm1, ebx)
+	AS2(	mov eax, [K_REG+0*4])		// s0
+	AS2(	xor eax, [WORD_REG(di)+0*4])
+	SAVE_0(eax)
+	AS2(	mov ebx, [K_REG+1*4])
+	AS2(	xor ebx, [WORD_REG(di)+1*4])
+	SAVE_1(ebx)
 	AS2(	and ebx, eax)
-	AS2(	mov eax, [esi+2*4])
-	AS2(	xor eax, [edi+2*4])
-	AS2(	movd mm2, eax)
+	AS2(	mov eax, [K_REG+2*4])
+	AS2(	xor eax, [WORD_REG(di)+2*4])
+	SAVE_2(eax)
 	AS2(	and ebx, eax)
-	AS2(	mov ecx, [esi+3*4])
-	AS2(	xor ecx, [edi+3*4])
+	AS2(	mov ecx, [K_REG+3*4])
+	AS2(	xor ecx, [WORD_REG(di)+3*4])
 	AS2(	and ebx, ecx)
 	// read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction
 	AS2(	and ebx, 0)
 	AS2(	mov edi, ebx)	// make index depend on previous loads to simulate lfence
 	ASL(2)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
-	AS2(	and ebx, [ebp+edi])
+	AS2(	and ebx, [WORD_REG(bp)+WORD_REG(di)])
 	AS2(	add edi, edx)
 	AS2(	cmp edi, 1024)
 	ASJ(	jl, 2, b)
-	AS2(	and ebx, [ebp+1020])
+	AS2(	and ebx, [WORD_REG(bp)+1020])
+#if CRYPTOPP_BOOL_X64
+	AS2(	xor r10d, ebx)
+	AS2(	xor r11d, ebx)
+	AS2(	xor r12d, ebx)
+#else
 	AS2(	movd mm6, ebx)
 	AS2(	pxor mm2, mm6)
 	AS2(	pxor mm1, mm6)
 	AS2(	pxor mm0, mm6)
+#endif
 	AS2(	xor ecx, ebx)
-	AS2(	mov edi, [esi+4*4])		// t0
-	AS2(	mov eax, [esi+5*4])
-	AS2(	mov ebx, [esi+6*4])
-	AS2(	mov edx, [esi+7*4])
-	AS2(	add esi, 8*4)
-	AS2(	movd mm4, esi)
+	AS2(	mov edi, [K_REG+4*4])		// t0
+	AS2(	mov eax, [K_REG+5*4])
+	AS2(	mov ebx, [K_REG+6*4])
+	AS2(	mov edx, [K_REG+7*4])
+	AS2(	add K_REG, 8*4)
+	SAVE_K

 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(a, [ebp+3*1024+4*esi])
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])

 #define s0		xor edi
 #define s1		xor eax
@@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #define t3		xor edx

 	QUARTER_ROUND(c, t0, t1, t2, t3)
-	AS2(	movd ecx, mm2)
+	RESTORE_2(ecx)
 	QUARTER_ROUND(c, t3, t0, t1, t2)
-	AS2(	movd ecx, mm1)
+	RESTORE_1(ecx)
 	QUARTER_ROUND(c, t2, t3, t0, t1)
-	AS2(	movd ecx, mm0)
+	RESTORE_0(ecx)
 	QUARTER_ROUND(c, t1, t2, t3, t0)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)
 #undef QUARTER_ROUND

-	AS2(	movd esi, mm4)
+	RESTORE_K
 	ASL(0)
-	AS2(	mov edi, [esi+0*4])
-	AS2(	mov eax, [esi+1*4])
-	AS2(	mov ebx, [esi+2*4])
-	AS2(	mov ecx, [esi+3*4])
+	AS2(	mov edi, [K_REG+0*4])
+	AS2(	mov eax, [K_REG+1*4])
+	AS2(	mov ebx, [K_REG+2*4])
+	AS2(	mov ecx, [K_REG+3*4])

 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(d, [ebp+0*1024+4*esi])
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])

 	QUARTER_ROUND(d, s0, s1, s2, s3)
-	AS2(	movd edx, mm2)
+	RESTORE_2(edx)
 	QUARTER_ROUND(d, s3, s0, s1, s2)
-	AS2(	movd edx, mm1)
+	RESTORE_1(edx)
 	QUARTER_ROUND(d, s2, s3, s0, s1)
-	AS2(	movd edx, mm0)
+	RESTORE_0(edx)
 	QUARTER_ROUND(d, s1, s2, s3, s0)
-	AS2(	movd esi, mm4)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
+	RESTORE_K
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)

-	AS2(	mov edi, [esi+4*4])
-	AS2(	mov eax, [esi+5*4])
-	AS2(	mov ebx, [esi+6*4])
-	AS2(	mov edx, [esi+7*4])
+	AS2(	mov edi, [K_REG+4*4])
+	AS2(	mov eax, [K_REG+5*4])
+	AS2(	mov ebx, [K_REG+6*4])
+	AS2(	mov edx, [K_REG+7*4])

 	QUARTER_ROUND(c, t0, t1, t2, t3)
-	AS2(	movd ecx, mm2)
+	RESTORE_2(ecx)
 	QUARTER_ROUND(c, t3, t0, t1, t2)
-	AS2(	movd ecx, mm1)
+	RESTORE_1(ecx)
 	QUARTER_ROUND(c, t2, t3, t0, t1)
-	AS2(	movd ecx, mm0)
+	RESTORE_0(ecx)
 	QUARTER_ROUND(c, t1, t2, t3, t0)
-	AS2(	movd mm2, ebx)
-	AS2(	movd mm1, eax)
-	AS2(	movd mm0, edi)
-
-	AS2(	movd esi, mm4)
-	AS2(	movd edi, mm5)
-	AS2(	add esi, 8*4)
-	AS2(	movd mm4, esi)
-	AS2(	cmp edi, esi)
+	SAVE_2(ebx)
+	SAVE_1(eax)
+	SAVE_0(edi)
+
+	RESTORE_K
+	RESTORE_K_END
+	AS2(	add K_REG, 8*4)
+	SAVE_K
+	AS2(	cmp K_END_REG, K_REG)
 	ASJ(	jne, 0, b)
 #undef QUARTER_ROUND
@@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #undef t2
 #undef t3

-	AS2(	mov eax, [edi+0*4])
-	AS2(	mov ecx, [edi+1*4])
-	AS2(	mov esi, [edi+2*4])
-	AS2(	mov edi, [edi+3*4])
+	AS2(	mov eax, [K_END_REG+0*4])
+	AS2(	mov ecx, [K_END_REG+1*4])
+	AS2(	mov esi, [K_END_REG+2*4])
+	AS2(	mov edi, [K_END_REG+3*4])

 #define QUARTER_ROUND(a, b, c, d)	\
 	AS2(	movzx ebx, dl)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 3*8)\
 	AS2(	xor a, ebx)\
 	AS2(	movzx ebx, dh)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 2*8)\
 	AS2(	xor b, ebx)\
 	AS2(	shr edx, 16)\
 	AS2(	movzx ebx, dl)\
 	AS2(	shr edx, 8)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl ebx, 1*8)\
 	AS2(	xor c, ebx)\
-	AS2(	movzx ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
 	AS2(	xor d, ebx)

 	QUARTER_ROUND(eax, ecx, esi, edi)
-	AS2(	movd edx, mm2)
+	RESTORE_2(edx)
 	QUARTER_ROUND(edi, eax, ecx, esi)
-	AS2(	movd edx, mm1)
+	RESTORE_1(edx)
 	QUARTER_ROUND(esi, edi, eax, ecx)
-	AS2(	movd edx, mm0)
+	RESTORE_0(edx)
 	QUARTER_ROUND(ecx, esi, edi, eax)
 #undef QUARTER_ROUND

-	AS1(	pop ebp)
-	AS1(	emms)
+#if CRYPTOPP_BOOL_X64
+	AS1(popq	r12)
+	AS1(popq	r11)
+	AS1(popq	r10)
+	AS1(popq	K_END_REG)
+	AS1(popq	K_REG)
+#else
+	AS1(emms)
+#endif
+	AS_POP(		bp)
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
+	AS_POP(		bx)
+#endif
 #ifdef __GNUC__
-	AS1(	pop ebx)
 		".att_syntax prefix;"
 		: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
 		: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
@@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	((word32 *)outBlock)[2] = t2;
 	((word32 *)outBlock)[3] = t3;
 #else
-	AS2(	mov ebx, xorBlock)
-	AS2(	test ebx, ebx)
+	AS2(	mov WORD_REG(bx), xorBlock)
+	AS2(	test WORD_REG(bx), WORD_REG(bx))
 	ASJ(	jz, 1, f)
-	AS2(	xor eax, [ebx+0*4])
-	AS2(	xor ecx, [ebx+1*4])
-	AS2(	xor esi, [ebx+2*4])
-	AS2(	xor edi, [ebx+3*4])
+	AS2(	xor eax, [WORD_REG(bx)+0*4])
+	AS2(	xor ecx, [WORD_REG(bx)+1*4])
+	AS2(	xor esi, [WORD_REG(bx)+2*4])
+	AS2(	xor edi, [WORD_REG(bx)+3*4])
 	ASL(1)
-	AS2(	mov ebx, outBlock)
-	AS2(	mov [ebx+0*4], eax)
-	AS2(	mov [ebx+1*4], ecx)
-	AS2(	mov [ebx+2*4], esi)
-	AS2(	mov [ebx+3*4], edi)
+	AS2(	mov WORD_REG(bx), outBlock)
+	AS2(	mov [WORD_REG(bx)+0*4], eax)
+	AS2(	mov [WORD_REG(bx)+1*4], ecx)
+	AS2(	mov [WORD_REG(bx)+2*4], esi)
+	AS2(	mov [WORD_REG(bx)+3*4], edi)
 #endif
 	}
 	else
--
cgit v1.2.1
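
Note on the macro layer used above: WORD_REG, K_REG/K_END_REG, AS_PUSH/AS_POP and the AS1/AS2 wrappers are defined elsewhere in the library's headers, not in this patch's rijndael.cpp hunks (only the SAVE_*/RESTORE_* and K_REG selections are added here). The snippet below is a hypothetical, self-contained sketch of the idea only, not the library's actual definitions: one copy of Intel-syntax operand text that expands to 32-bit registers with MMX spill slots on x86, and to 64-bit registers plus the spare r10-r12 registers on x64, which is why the x64 path no longer needs the trailing emms.

    // Illustrative sketch only -- NOT the real definitions from Crypto++'s headers.
    // Prints what a few of the shared assembly fragments expand to for the target
    // this is compiled for; it performs no actual encryption or inline assembly.
    #include <cstdio>

    #if defined(_M_X64) || defined(__x86_64__)
    #  define WORD_REG(x) r##x           // WORD_REG(bp) -> rbp
    #  define K_REG       r8             // round-key pointer kept in a spare register
    #  define SAVE_0(x)   mov r10d, x    // spare 32-bit registers replace mm0-mm2
    #else
    #  define WORD_REG(x) e##x           // WORD_REG(bp) -> ebp
    #  define K_REG       esi            // round-key pointer kept in esi
    #  define SAVE_0(x)   movd mm0, x    // state word parked in an MMX register
    #endif

    // Stringize after full macro expansion (variadic so embedded commas survive).
    #define STR_(...) #__VA_ARGS__
    #define STR(...)  STR_(__VA_ARGS__)

    int main()
    {
        std::printf("mov %s, inBlock\n", STR(WORD_REG(di)));
        std::printf("mov eax, [%s+0*4]\n", STR(K_REG));
        std::printf("%s\n", STR(SAVE_0(eax)));
        return 0;
    }

On a 64-bit build this prints the r8/r10d forms, on a 32-bit build the esi/mm0 forms; the patch relies on exactly this kind of token pasting so one body of AES round code serves both targets, with the AS1/AS2 wrappers emitting either MSVC __asm statements or GCC inline-asm strings.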
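
The "read Te0 into L1 cache" block that the first hunk rewrites with WORD_REG is a timing-attack countermeasure: before any secret-dependent table lookups, it touches one word in each cache line of the first 1 KB of Te, chaining the loads through a register so they cannot be reordered ahead of the input loads, because lfence is SSE2 and this path only assumes MMX. A rough C++ rendering of the idea (hypothetical names; the real code does this in assembly using Te and g_cacheLineSize) looks like this:

    // Hypothetical sketch of the cache-priming idea, not code from the patch.
    #include <cstddef>

    typedef unsigned int word32;

    inline word32 PrimeTable(const word32 *table, std::size_t cacheLineSize)
    {
        word32 dep = ~word32(0);
        // Touch one word per cache line of the first 1024 bytes so that later
        // secret-indexed loads are L1 hits and leak less through timing.
        for (std::size_t offset = 0; offset < 1024; offset += cacheLineSize)
            dep &= table[offset / sizeof(word32)];
        dep &= table[1020 / sizeof(word32)];   // last word, as in the assembly
        // The caller folds this into the state; in the real code the accumulator
        // is ANDed with 0 first, so the value is always zero and exists only to
        // make the state computation data-dependent on the priming loads.
        return dep & 0;
    }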