summaryrefslogtreecommitdiff
path: root/cipher/arcfour-amd64.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-12-30 15:10:13 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-12-30 15:10:13 +0200
commit7547898109c72a97e3102b2a045ee4fdb2aa40bf (patch)
tree2f49ce05d804a1587435633a4b11cddc94fbf0c5 /cipher/arcfour-amd64.S
parenta05be441d8cd89b90d8d58e3a343a436dae377d0 (diff)
downloadlibgcrypt-7547898109c72a97e3102b2a045ee4fdb2aa40bf.tar.gz
Add AMD64 assembly implementation for arcfour
* cipher/Makefile.am: Add 'arcfour-amd64.S'. * cipher/arcfour-amd64.S: New. * cipher/arcfour.c (USE_AMD64_ASM): New. [USE_AMD64_ASM] (ARCFOUR_context, _gcry_arcfour_amd64) (encrypt_stream): New. * configure.ac [host=x86_64]: Add 'arcfour-amd64.lo'. -- Patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to libgcrypt. Original implementation is at: http://www.zorinaq.com/papers/rc4-amd64.html Benchmarks on Intel i5-4570 (3200 Mhz): New: ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.29 ns/B 737.7 MiB/s 4.14 c/B STREAM dec | 1.31 ns/B 730.6 MiB/s 4.18 c/B Old (C-language): ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 2.09 ns/B 457.4 MiB/s 6.67 c/B STREAM dec | 2.09 ns/B 457.2 MiB/s 6.68 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/arcfour-amd64.S')
-rw-r--r--cipher/arcfour-amd64.S97
1 files changed, 97 insertions, 0 deletions
diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
new file mode 100644
index 00000000..c32cd6f1
--- /dev/null
+++ b/cipher/arcfour-amd64.S
@@ -0,0 +1,97 @@
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+**
+** 2013/12/20 <jussi.kivilinna@iki.fi>:
+** - Integrated to libgcrypt
+** - 4.18 cycles/byte on Intel i5-4570
+*/
+
+#ifdef __x86_64__
+#include <config.h>
+#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+
+.text
+.align 16
+.globl _gcry_arcfour_amd64
+.type _gcry_arcfour_amd64,@function
+_gcry_arcfour_amd64: # RC4 xor of len bytes; args: rdi = key state, rsi = len, rdx = in, rcx = out
+ push %rbp # rbp/rbx are used as scratch below, restored before ret
+ push %rbx
+ mov %rdi, %rbp # key = ARG(key), base of the d[] table of 32-bit entries
+ mov %rsi, %rbx # rbx = ARG(len)
+ mov %rdx, %rsi # in = ARG(in)
+ mov %rcx, %rdi # out = ARG(out)
+ mov (4*256)(%rbp), %ecx # x = key->x (dword at byte offset 4*256, after d[0..255])
+ mov (4*256+4)(%rbp),%edx # y = key->y (dword at byte offset 4*256+4)
+ inc %rcx # x++ (x is kept one step ahead inside this function)
+ and $255, %rcx # x &= 0xff
+ lea -8(%rbx,%rsi), %rbx # rbx = in+len-8
+ mov %rbx, %r9 # tmp = in+len-8 (rbx itself is reused as scratch below)
+ mov (%rbp,%rcx,4), %eax # tx = d[x]
+ cmp %rsi, %rbx # flags from (in+len-8) - in
+ jl .Lend # fewer than 8 bytes in total: skip the 8-byte loop
+
+.Lstart:
+ add $8, %rsi # increment in (the 8 bytes are accessed at -8 below)
+ add $8, %rdi # increment out
+
+ # generate the next 8 bytes of the rc4 stream into %r8
+ mov $8, %r11 # byte counter
+1: add %al, %dl # y += tx (8-bit add, wraps mod 256)
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx (8-bit add, wraps mod 256)
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ shl $8, %r8 # make room; earlier stream bytes move towards the top
+ movb (%rbp,%rbx,4), %r8b # val = d[val], newest stream byte in the low byte
+ dec %r11b
+ jnz 1b
+
+ # xor 8 bytes
+ bswap %r8 # first generated byte to the lowest address position
+ xor -8(%rsi), %r8
+ cmp %r9, %rsi # flags from in - (in+len-8)
+ mov %r8, -8(%rdi) # mov leaves the flags of the cmp above intact
+ jle .Lstart # jump if (in <= in+len-8), i.e. 8 more bytes remain
+
+.Lend:
+ add $8, %r9 # tmp = in+len
+
+ # handle the last bytes, one by one
+1: cmp %rsi, %r9 # flags from (in+len) - in
+ jle .Lfinished # jump if (in+len <= in)
+ add %al, %dl # y += tx
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ movb (%rbp,%rbx,4), %r8b # val = d[val]
+ xor (%rsi), %r8b # xor 1 byte
+ movb %r8b, (%rdi)
+ inc %rsi # in++
+ inc %rdi # out++
+ jmp 1b
+
+.Lfinished:
+ dec %rcx # x-- (undo the look-ahead increment of x)
+ movb %cl, (4*256)(%rbp) # key->x = x, low byte into the slot x was loaded from
+ movb %dl, (4*256+4)(%rbp) # key->y = y, low byte into the slot y was loaded from
+ pop %rbx
+ pop %rbp
+ ret
+.L__gcry_arcfour_amd64_end:
+.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64
+
+#endif
+#endif