author     Niels Möller <nisse@lysator.liu.se>  2008-09-17 23:03:01 +0200
committer  Niels Möller <nisse@lysator.liu.se>  2008-09-17 23:03:01 +0200
commit     644dc4fd448d7aaf92db6453c7694fe92e44b9c1
tree       3f87a41bf0f70c050e447beea66290db7c7a69cf /x86
parent     7dcba661d62d9248ed7b892024a2442b0520016f
download   nettle-644dc4fd448d7aaf92db6453c7694fe92e44b9c1.tar.gz
Improved loop logic, and unrolled loop twice. Gave a modest speedup.
Rev: nettle/x86/arcfour-crypt.asm:1.2
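For orientation before reading the diff: arcfour-crypt.asm implements the inner loop of the RC4 (arcfour) stream cipher, where each output byte costs one index increment, one table-driven swap, and one XOR. Below is a minimal C model of that per-byte round; the struct layout follows nettle's arcfour_ctx, but the function name and body here are illustrative sketches, not nettle's actual C code.

```c
#include <stdint.h>
#include <stddef.h>

/* State layout assumed to match nettle's struct arcfour_ctx. */
struct arcfour_ctx { uint8_t S[256]; uint8_t i; uint8_t j; };

/* Reference version of the loop the assembly implements:
   one RC4 round per output byte. */
void
arcfour_crypt_ref(struct arcfour_ctx *ctx, size_t length,
                  uint8_t *dst, const uint8_t *src)
{
  uint8_t i = ctx->i, j = ctx->j;
  size_t n;

  for (n = 0; n < length; n++)
    {
      uint8_t si, sj;
      i++;                      /* incl %eax; movzbl %al, %eax */
      si = ctx->S[i];           /* movb (%ebp, %eax), %cl */
      j += si;                  /* addb %cl, %bl */
      sj = ctx->S[j];           /* movb (%ebp, %ebx), %ch */
      ctx->S[i] = sj;           /* swap S[i] and S[j] */
      ctx->S[j] = si;
      dst[n] = src[n] ^ ctx->S[(uint8_t)(si + sj)];
    }
  ctx->i = i;
  ctx->j = j;
}
```

Unrolling this body twice halves the loop-control overhead (the index update and the taken branch) per byte, which is consistent with the modest speedup the commit message reports.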
Diffstat (limited to 'x86')
-rw-r--r--  x86/arcfour-crypt.asm  63
1 file changed, 42 insertions, 21 deletions
diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm
index 8bf04539..bdeb98f0 100644
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -38,48 +38,69 @@ C Input arguments:
 C src = 32(%esp)
 C Register usage:
 C %ebp = ctx
-C %esi = src (updated through out loop)
-C %edi = dst (updated through out loop)
-C %edx = src + length (end of source area)
+C %esi = src
+C %edi = dst
+C %edx = loop counter
 C %eax = i
 C %ebx = j
 C %cl = si
 C %ch = sj
 
         movl    24(%esp), %edx          C length
-        testl   %edx,%edx
-        jz      .Lend
-
         movl    20(%esp), %ebp          C ctx
-        movl    28(%esp), %edi
-        movl    32(%esp), %esi
-        addl    %esi, %edx              C Keep src + length
+        movl    28(%esp), %edi          C dst
+        movl    32(%esp), %esi          C src
+
+        lea     (%edx, %edi), %edi
+        lea     (%edx, %esi), %esi
+        negl    %edx
+        jnc     .Lend
 
         movzbl  ARCFOUR_I (%ebp), %eax  C i
         movzbl  ARCFOUR_J (%ebp), %ebx  C j
+
+        incb    %al
+        sarl    $1, %edx
+        jc      .Lloop_odd
+
+        ALIGN(4)
 .Lloop:
-C       incb    %al
+        movb    (%ebp, %eax), %cl       C si.
+        addb    %cl, %bl
+        movb    (%ebp, %ebx), %ch       C sj
+        movb    %ch, (%ebp, %eax)       C S[i] = sj
         incl    %eax
-        andl    $0xff, %eax
-        movzbl  (%ebp, %eax), %ecx      C si. Clears high bytes
+        movzbl  %al, %eax
+        movb    %cl, (%ebp, %ebx)       C S[j] = si
+        addb    %ch, %cl
+        movzbl  %cl, %ecx               C Clear, so it can be used
+                                        C for indexing.
+        movb    (%ebp, %ecx), %cl
+        xorb    (%esi, %edx, 2), %cl
+        movb    %cl, (%edi, %edx, 2)
+
+        C FIXME: Could exchange cl and ch in the second half
+        C and try to interleave instructions better.
+.Lloop_odd:
+        movb    (%ebp, %eax), %cl       C si.
         addb    %cl, %bl
-C       The addl andl is preferable on PPro and PII, but slows thing down on AMD Duron.
-C       addl    %ecx, %ebx
-C       andl    $0xff, %ebx
         movb    (%ebp, %ebx), %ch       C sj
         movb    %ch, (%ebp, %eax)       C S[i] = sj
+        incl    %eax
+        movzbl  %al, %eax
         movb    %cl, (%ebp, %ebx)       C S[j] = si
         addb    %ch, %cl
         movzbl  %cl, %ecx               C Clear, so it can be used
                                         C for indexing.
         movb    (%ebp, %ecx), %cl
-        xorb    (%esi), %cl
-        incl    %esi
-        movb    %cl, (%edi)
-        incl    %edi
-        cmpl    %esi, %edx
-        jne     .Lloop
+        xorb    1(%esi, %edx, 2), %cl
+        incl    %edx
+        movb    %cl, -1(%edi, %edx, 2)
+
+        jnz     .Lloop
+C .Lloop_done:
+        decb    %al
 
         movb    %al, ARCFOUR_I (%ebp)   C Store the new i and j.
         movb    %bl, ARCFOUR_J (%ebp)
 .Lend:
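Two things changed in the loop logic besides the unrolling. First, instead of keeping a src + length end pointer and paying a cmpl/jne per byte, the new code advances both pointers to the end of their buffers and counts a negative index up toward zero, so the jnz fall-through replaces a separate compare (negl also leaves carry clear exactly when the length is zero, which jnc .Lend exploits to skip empty input). Second, sarl $1 halves that negative count and pushes its low bit into the carry flag, so jc .Lloop_odd enters the unrolled body at its second half when the length is odd. Below is a hedged C rendering of this control flow, reusing the illustrative arcfour_ctx layout and round from the sketch above; the goto into the loop mirrors the jc entry, and the signed right shift is implementation-defined in ISO C but arithmetic on the targets this assembly covers.

```c
#include <stdint.h>
#include <stddef.h>

struct arcfour_ctx { uint8_t S[256]; uint8_t i; uint8_t j; };

/* One RC4 round, as in the reference sketch above. */
static uint8_t
rc4_round(struct arcfour_ctx *ctx, uint8_t *i, uint8_t *j)
{
  uint8_t si, sj;
  ++*i;
  si = ctx->S[*i];
  *j += si;
  sj = ctx->S[*j];
  ctx->S[*i] = sj;
  ctx->S[*j] = si;
  return ctx->S[(uint8_t)(si + sj)];
}

/* Control-flow model of the unrolled loop (illustrative, not
   nettle's code). */
void
arcfour_crypt_unrolled(struct arcfour_ctx *ctx, size_t length,
                       uint8_t *dst, const uint8_t *src)
{
  uint8_t i = ctx->i, j = ctx->j;
  const uint8_t *src_end = src + length;  /* lea (%edx, %esi), %esi */
  uint8_t *dst_end = dst + length;        /* lea (%edx, %edi), %edi */
  ptrdiff_t n = -(ptrdiff_t)length;       /* negl %edx */

  if (n != 0)                             /* jnc .Lend on zero length */
    {
      int odd = (int)(n & 1);             /* bit that sarl $1 shifts into CF */
      n >>= 1;                            /* arithmetic shift of the negative count */
      if (odd)
        goto loop_odd;                    /* jc .Lloop_odd */

      do
        {
          /* First half: even byte, at offset 2*n from the end. */
          dst_end[2*n] = src_end[2*n] ^ rc4_round(ctx, &i, &j);
        loop_odd:
          /* Second half: odd byte, at offset 2*n + 1. */
          dst_end[2*n + 1] = src_end[2*n + 1] ^ rc4_round(ctx, &i, &j);
          n++;                            /* incl %edx */
        }
      while (n != 0);                     /* jnz .Lloop */
    }
  ctx->i = i;
  ctx->j = j;
}
```

The assembly additionally keeps i incremented one step ahead of the current round (incb %al before the loop, decb %al after) so the index update can be scheduled early in each unrolled half; the helper here folds that increment into the round itself, which is behaviorally equivalent.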