author     Niels Möller <nisse@lysator.liu.se>    2008-09-17 23:03:01 +0200
committer  Niels Möller <nisse@lysator.liu.se>    2008-09-17 23:03:01 +0200
commit     644dc4fd448d7aaf92db6453c7694fe92e44b9c1 (patch)
tree       3f87a41bf0f70c050e447beea66290db7c7a69cf /x86
parent     7dcba661d62d9248ed7b892024a2442b0520016f (diff)
download   nettle-644dc4fd448d7aaf92db6453c7694fe92e44b9c1.tar.gz
Improved loop logic, and unrolled the loop twice. Gave a modest speedup.
Rev: nettle/x86/arcfour-crypt.asm:1.2
Diffstat (limited to 'x86')
-rw-r--r--  x86/arcfour-crypt.asm  63
1 file changed, 42 insertions, 21 deletions
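
For reference, a hedged C sketch (not Nettle's actual C code) of the single
RC4/arcfour byte step that each half of the unrolled assembly loop below
performs; the helper name rc4_step and its signature are illustrative only:

#include <stdint.h>

static uint8_t
rc4_step(uint8_t S[256], uint8_t *i, uint8_t *j)
{
  *i = (uint8_t)(*i + 1);         /* i lives in %al; incl + movzbl wraps it */
  uint8_t si = S[*i];             /* si is kept in %cl */
  *j = (uint8_t)(*j + si);        /* j lives in %bl */
  uint8_t sj = S[*j];             /* sj is kept in %ch */
  S[*i] = sj;                     /* swap S[i] and S[j] */
  S[*j] = si;
  return S[(uint8_t)(si + sj)];   /* keystream byte, XORed with the source */
}
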
diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm
index 8bf04539..bdeb98f0 100644
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -38,48 +38,69 @@ C Input arguments:
C src = 32(%esp)
C Register usage:
C %ebp = ctx
- C %esi = src (updated through out loop)
- C %edi = dst (updated through out loop)
- C %edx = src + length (end of source area)
+ C %esi = src
+ C %edi = dst
+ C %edx = loop counter
C %eax = i
C %ebx = j
C %cl = si
C %ch = sj
movl 24(%esp), %edx C length
- testl %edx,%edx
- jz .Lend
-
movl 20(%esp), %ebp C ctx
- movl 28(%esp), %edi
- movl 32(%esp), %esi
- addl %esi, %edx C Keep src + length
+ movl 28(%esp), %edi C dst
+ movl 32(%esp), %esi C src
+
+ lea (%edx, %edi), %edi
+ lea (%edx, %esi), %esi
+ negl %edx
+ jnc .Lend
movzbl ARCFOUR_I (%ebp), %eax C i
movzbl ARCFOUR_J (%ebp), %ebx C j
+
+ incb %al
+ sarl $1, %edx
+ jc .Lloop_odd
+
+ ALIGN(4)
.Lloop:
-C incb %al
+ movb (%ebp, %eax), %cl C si.
+ addb %cl, %bl
+ movb (%ebp, %ebx), %ch C sj
+ movb %ch, (%ebp, %eax) C S[i] = sj
incl %eax
- andl $0xff, %eax
- movzbl (%ebp, %eax), %ecx C si. Clears high bytes
+ movzbl %al, %eax
+ movb %cl, (%ebp, %ebx) C S[j] = si
+ addb %ch, %cl
+ movzbl %cl, %ecx C Clear, so it can be used
+ C for indexing.
+ movb (%ebp, %ecx), %cl
+ xorb (%esi, %edx, 2), %cl
+ movb %cl, (%edi, %edx, 2)
+
+ C FIXME: Could exchange cl and ch in the second half
+ C and try to interleave instructions better.
+.Lloop_odd:
+ movb (%ebp, %eax), %cl C si.
addb %cl, %bl
-C The addl andl is preferable on PPro and PII, but slows thing down on AMD Duron.
-C addl %ecx, %ebx
-C andl $0xff, %ebx
movb (%ebp, %ebx), %ch C sj
movb %ch, (%ebp, %eax) C S[i] = sj
+ incl %eax
+ movzbl %al, %eax
movb %cl, (%ebp, %ebx) C S[j] = si
addb %ch, %cl
movzbl %cl, %ecx C Clear, so it can be used
C for indexing.
movb (%ebp, %ecx), %cl
- xorb (%esi), %cl
- incl %esi
- movb %cl, (%edi)
- incl %edi
- cmpl %esi, %edx
- jne .Lloop
+ xorb 1(%esi, %edx, 2), %cl
+ incl %edx
+ movb %cl, -1(%edi, %edx, 2)
+
+ jnz .Lloop
+C .Lloop_done:
+ decb %al
movb %al, ARCFOUR_I (%ebp) C Store the new i and j.
movb %bl, ARCFOUR_J (%ebp)
.Lend:
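
The restructured loop can be read as: point %esi and %edi at the end of the
buffers, keep a negated, halved byte count in %edx, jump into the second half
of the unrolled body when the length is odd, and then handle two bytes per
pass until the counter reaches zero. A hedged C sketch of that structure,
reusing the hypothetical rc4_step() helper from the sketch above (again
illustrative, not Nettle's code):

#include <stddef.h>
#include <stdint.h>

static void
arcfour_crypt_unrolled(uint8_t S[256], uint8_t *i, uint8_t *j,
                       size_t length, uint8_t *dst, const uint8_t *src)
{
  if (length == 0)                /* negl %edx; jnc .Lend */
    return;

  if (length & 1)                 /* odd length: enter at .Lloop_odd */
    *dst++ = *src++ ^ rc4_step(S, i, j);

  for (size_t n = length / 2; n > 0; n--)   /* two bytes per pass */
    {
      dst[0] = src[0] ^ rc4_step(S, i, j);
      dst[1] = src[1] ^ rc4_step(S, i, j);
      dst += 2;
      src += 2;
    }
}

Compared with the old loop, a single incl %edx both advances the index and
sets the flags for jnz, so the per-iteration cmpl %esi, %edx disappears, and
the fixed end pointers mean neither %esi nor %edi needs to be incremented.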