diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-12-30 15:10:13 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-12-30 15:10:13 +0200 |
commit | 7547898109c72a97e3102b2a045ee4fdb2aa40bf (patch) | |
tree | 2f49ce05d804a1587435633a4b11cddc94fbf0c5 /cipher/arcfour-amd64.S | |
parent | a05be441d8cd89b90d8d58e3a343a436dae377d0 (diff) | |
download | libgcrypt-7547898109c72a97e3102b2a045ee4fdb2aa40bf.tar.gz |
Add AMD64 assembly implementation for arcfour
* cipher/Makefile.am: Add 'arcfour-amd64.S'.
* cipher/arcfour-amd64.S: New.
* cipher/arcfour.c (USE_AMD64_ASM): New.
[USE_AMD64_ASM] (ARCFOUR_context, _gcry_arcfour_amd64)
(encrypt_stream): New.
* configure.ac [host=x86_64]: Add 'arcfour-amd64.lo'.
--
Patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to
libgcrypt. Original implementation is at:
http://www.zorinaq.com/papers/rc4-amd64.html
Benchmarks on Intel i5-4570 (3200 Mhz):
New:
ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 1.29 ns/B 737.7 MiB/s 4.14 c/B
STREAM dec | 1.31 ns/B 730.6 MiB/s 4.18 c/B
Old (C-language):
ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 2.09 ns/B 457.4 MiB/s 6.67 c/B
STREAM dec | 2.09 ns/B 457.2 MiB/s 6.68 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/arcfour-amd64.S')
-rw-r--r-- | cipher/arcfour-amd64.S | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S new file mode 100644 index 00000000..c32cd6f1 --- /dev/null +++ b/cipher/arcfour-amd64.S @@ -0,0 +1,97 @@ +/* +** RC4 implementation optimized for AMD64. +** +** Author: Marc Bevand <bevand_m (at) epita.fr> +** Licence: I hereby disclaim the copyright on this code and place it +** in the public domain. +** +** The throughput achieved by this code is about 320 MBytes/sec, on +** a 1.8 GHz AMD Opteron (rev C0) processor. +** +** 2013/12/20 <jussi.kivilinna@iki.fi>: +** - Integrated to libgcrypt +** - 4.18 cycles/byte on Intel i5-4570 +*/ + +#ifdef __x86_64__ +#include <config.h> +#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) + +.text +.align 16 +.globl _gcry_arcfour_amd64 +.type _gcry_arcfour_amd64,@function +_gcry_arcfour_amd64: + push %rbp + push %rbx + mov %rdi, %rbp # key = ARG(key) + mov %rsi, %rbx # rbx = ARG(len) + mov %rdx, %rsi # in = ARG(in) + mov %rcx, %rdi # out = ARG(out) + mov (4*256)(%rbp), %ecx # x = key->x + mov (4*256+4)(%rbp),%edx # y = key->y + inc %rcx # x++ + and $255, %rcx # x &= 0xff + lea -8(%rbx,%rsi), %rbx # rbx = in+len-8 + mov %rbx, %r9 # tmp = in+len-8 + mov (%rbp,%rcx,4), %eax # tx = d[x] + cmp %rsi, %rbx # cmp in with in+len-8 + jl .Lend # jump if (in+len-8 < in) + +.Lstart: + add $8, %rsi # increment in + add $8, %rdi # increment out + + # generate the next 8 bytes of the rc4 stream into %r8 + mov $8, %r11 # byte counter +1: add %al, %dl # y += tx + mov (%rbp,%rdx,4), %ebx # ty = d[y] + mov %ebx, (%rbp,%rcx,4) # d[x] = ty + add %al, %bl # val = ty + tx + mov %eax, (%rbp,%rdx,4) # d[y] = tx + inc %cl # x++ (NEXT ROUND) + mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) + shl $8, %r8 + movb (%rbp,%rbx,4), %r8b # val = d[val] + dec %r11b + jnz 1b + + # xor 8 bytes + bswap %r8 + xor -8(%rsi), %r8 + cmp %r9, %rsi # cmp in+len-8 with in + mov %r8, -8(%rdi) + jle .Lstart # jump if (in <= in+len-8) + +.Lend: + add $8, %r9 # tmp = in+len + + # handle the last bytes, one by one +1: cmp %rsi, %r9 # cmp in with in+len + jle .Lfinished # jump if (in+len <= in) + add %al, %dl # y += tx + mov (%rbp,%rdx,4), %ebx # ty = d[y] + mov %ebx, (%rbp,%rcx,4) # d[x] = ty + add %al, %bl # val = ty + tx + mov %eax, (%rbp,%rdx,4) # d[y] = tx + inc %cl # x++ (NEXT ROUND) + mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) + movb (%rbp,%rbx,4), %r8b # val = d[val] + xor (%rsi), %r8b # xor 1 byte + movb %r8b, (%rdi) + inc %rsi # in++ + inc %rdi # out++ + jmp 1b + +.Lfinished: + dec %rcx # x-- + movb %dl, (4*256)(%rbp) # key->y = y + movb %cl, (4*256+4)(%rbp) # key->x = x + pop %rbx + pop %rbp + ret +.L__gcry_arcfour_amd64_end: +.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64 + +#endif +#endif |