author     Niels Möller <nisse@lysator.liu.se>    2008-09-17 23:03:01 +0200
committer  Niels Möller <nisse@lysator.liu.se>    2008-09-17 23:03:01 +0200
commit     644dc4fd448d7aaf92db6453c7694fe92e44b9c1 (patch)
tree       3f87a41bf0f70c050e447beea66290db7c7a69cf /x86
parent     7dcba661d62d9248ed7b892024a2442b0520016f (diff)
download   nettle-644dc4fd448d7aaf92db6453c7694fe92e44b9c1.tar.gz
Improved loop logic, and unrolled the loop twice. Gave a modest speedup.
Rev: nettle/x86/arcfour-crypt.asm:1.2
Diffstat (limited to 'x86')
-rw-r--r--  x86/arcfour-crypt.asm  63
1 file changed, 42 insertions, 21 deletions
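
For reference, a hedged C sketch (not Nettle's actual C code) of the single
RC4/arcfour byte step that each half of the unrolled assembly loop below
performs; the helper name rc4_step and its signature are illustrative only:

#include <stdint.h>

static uint8_t
rc4_step(uint8_t S[256], uint8_t *i, uint8_t *j)
{
  *i = (uint8_t)(*i + 1);         /* i lives in %al; incl + movzbl wraps it */
  uint8_t si = S[*i];             /* si is kept in %cl */
  *j = (uint8_t)(*j + si);        /* j lives in %bl */
  uint8_t sj = S[*j];             /* sj is kept in %ch */
  S[*i] = sj;                     /* swap S[i] and S[j] */
  S[*j] = si;
  return S[(uint8_t)(si + sj)];   /* keystream byte, XORed with the source */
}
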
diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm
index 8bf04539..bdeb98f0 100644
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -38,48 +38,69 @@ C Input arguments:
C src = 32(%esp)
C Register usage:
C %ebp = ctx
- C %esi = src (updated through out loop)
- C %edi = dst (updated through out loop)
- C %edx = src + length (end of source area)
+ C %esi = src
+ C %edi = dst
+ C %edx = loop counter
C %eax = i
C %ebx = j
C %cl = si
C %ch = sj
movl 24(%esp), %edx C length
- testl %edx,%edx
- jz .Lend
-
movl 20(%esp), %ebp C ctx
- movl 28(%esp), %edi
- movl 32(%esp), %esi
- addl %esi, %edx C Keep src + length
+ movl 28(%esp), %edi C dst
+ movl 32(%esp), %esi C src
+
+ lea (%edx, %edi), %edi
+ lea (%edx, %esi), %esi
+ negl %edx
+ jnc .Lend
movzbl ARCFOUR_I (%ebp), %eax C i
movzbl ARCFOUR_J (%ebp), %ebx C j
+
+ incb %al
+ sarl $1, %edx
+ jc .Lloop_odd
+
+ ALIGN(4)
.Lloop:
-C incb %al
+ movb (%ebp, %eax), %cl C si.
+ addb %cl, %bl
+ movb (%ebp, %ebx), %ch C sj
+ movb %ch, (%ebp, %eax) C S[i] = sj
incl %eax
- andl $0xff, %eax
- movzbl (%ebp, %eax), %ecx C si. Clears high bytes
+ movzbl %al, %eax
+ movb %cl, (%ebp, %ebx) C S[j] = si
+ addb %ch, %cl
+ movzbl %cl, %ecx C Clear, so it can be used
+ C for indexing.
+ movb (%ebp, %ecx), %cl
+ xorb (%esi, %edx, 2), %cl
+ movb %cl, (%edi, %edx, 2)
+
+ C FIXME: Could exchange cl and ch in the second half
+ C and try to interleave instructions better.
+.Lloop_odd:
+ movb (%ebp, %eax), %cl C si.
addb %cl, %bl
-C The addl andl is preferable on PPro and PII, but slows thing down on AMD Duron.
-C addl %ecx, %ebx
-C andl $0xff, %ebx
movb (%ebp, %ebx), %ch C sj
movb %ch, (%ebp, %eax) C S[i] = sj
+ incl %eax
+ movzbl %al, %eax
movb %cl, (%ebp, %ebx) C S[j] = si
addb %ch, %cl
movzbl %cl, %ecx C Clear, so it can be used
C for indexing.
movb (%ebp, %ecx), %cl
- xorb (%esi), %cl
- incl %esi
- movb %cl, (%edi)
- incl %edi
- cmpl %esi, %edx
- jne .Lloop
+ xorb 1(%esi, %edx, 2), %cl
+ incl %edx
+ movb %cl, -1(%edi, %edx, 2)
+
+ jnz .Lloop
+C .Lloop_done:
+ decb %al
movb %al, ARCFOUR_I (%ebp) C Store the new i and j.
movb %bl, ARCFOUR_J (%ebp)
.Lend:
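
The restructured loop can be read as: point %esi and %edi at the end of the
buffers, keep a negated, halved byte count in %edx, jump into the second half
of the unrolled body when the length is odd, and then handle two bytes per
pass until the counter reaches zero. A hedged C sketch of that structure,
reusing the hypothetical rc4_step() helper from the sketch above (again
illustrative, not Nettle's code):

#include <stddef.h>
#include <stdint.h>

static void
arcfour_crypt_unrolled(uint8_t S[256], uint8_t *i, uint8_t *j,
                       size_t length, uint8_t *dst, const uint8_t *src)
{
  if (length == 0)                /* negl %edx; jnc .Lend */
    return;

  if (length & 1)                 /* odd length: enter at .Lloop_odd */
    *dst++ = *src++ ^ rc4_step(S, i, j);

  for (size_t n = length / 2; n > 0; n--)   /* two bytes per pass */
    {
      dst[0] = src[0] ^ rc4_step(S, i, j);
      dst[1] = src[1] ^ rc4_step(S, i, j);
      dst += 2;
      src += 2;
    }
}

Compared with the old loop, a single incl %edx both advances the index and
sets the flags for jnz, so the per-iteration cmpl %esi, %edx disappears, and
the fixed end pointers mean neither %esi nor %edi needs to be incremented.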