diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-05-23 14:15:46 +0300 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 2013-05-23 17:34:23 +0200 |
commit | b60f06f70227c1e69e1010da8b47ea51ade48145 (patch) | |
tree | c59a9a6977c260a658bd9eb53a512eb07084027a /cipher/camellia-aesni-avx-amd64.S | |
parent | 319ee14f2aab8db56a830fd7ac8926f91b4f738a (diff) | |
download | libgcrypt-b60f06f70227c1e69e1010da8b47ea51ade48145.tar.gz |
camellia: add parallel processing for CFB decryption
* cipher/camellia-aesni-avx-amd64.S
(_gcry_camellia_aesni_avx_cfb_dec): New function.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New
prototype.
(_gcry_camellia_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* cipher/cipher.c (gry_cipher_open): Add bulk CFB decryption function
for Camellia.
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype.
--
Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy-Bridge.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-aesni-avx-amd64.S')
-rw-r--r-- | cipher/camellia-aesni-avx-amd64.S | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 2b1df17a..95c96b8b 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec: ret; .size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec; +.align 8 +.global _gcry_camellia_aesni_avx_cfb_dec +.type _gcry_camellia_aesni_avx_cfb_dec,@function; + +_gcry_camellia_aesni_avx_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + /* inpack16_pre: */ + vmovq (key_table)(CTX), %xmm0; + vpshufb .Lpack_bswap RIP, %xmm0, %xmm0; + vpxor (%rcx), %xmm0, %xmm15; + vmovdqu 15 * 16(%rdx), %xmm1; + vmovdqu %xmm1, (%rcx); /* store new IV */ + vpxor 0 * 16(%rdx), %xmm0, %xmm14; + vpxor 1 * 16(%rdx), %xmm0, %xmm13; + vpxor 2 * 16(%rdx), %xmm0, %xmm12; + vpxor 3 * 16(%rdx), %xmm0, %xmm11; + vpxor 4 * 16(%rdx), %xmm0, %xmm10; + vpxor 5 * 16(%rdx), %xmm0, %xmm9; + vpxor 6 * 16(%rdx), %xmm0, %xmm8; + vpxor 7 * 16(%rdx), %xmm0, %xmm7; + vpxor 8 * 16(%rdx), %xmm0, %xmm6; + vpxor 9 * 16(%rdx), %xmm0, %xmm5; + vpxor 10 * 16(%rdx), %xmm0, %xmm4; + vpxor 11 * 16(%rdx), %xmm0, %xmm3; + vpxor 12 * 16(%rdx), %xmm0, %xmm2; + vpxor 13 * 16(%rdx), %xmm0, %xmm1; + vpxor 14 * 16(%rdx), %xmm0, %xmm0; + + call __camellia_enc_blk16; + + addq $(16 * 16), %rsp; + + vpxor 0 * 16(%rdx), %xmm7, %xmm7; + vpxor 1 * 16(%rdx), %xmm6, %xmm6; + vpxor 2 * 16(%rdx), %xmm5, %xmm5; + vpxor 3 * 16(%rdx), %xmm4, %xmm4; + vpxor 4 * 16(%rdx), %xmm3, %xmm3; + vpxor 5 * 16(%rdx), %xmm2, %xmm2; + vpxor 6 * 16(%rdx), %xmm1, %xmm1; + vpxor 7 * 16(%rdx), %xmm0, %xmm0; + vpxor 8 * 16(%rdx), %xmm15, %xmm15; + vpxor 9 * 16(%rdx), %xmm14, %xmm14; + vpxor 10 * 16(%rdx), %xmm13, %xmm13; + vpxor 11 * 16(%rdx), %xmm12, %xmm12; + vpxor 12 * 16(%rdx), %xmm11, %xmm11; + vpxor 13 * 16(%rdx), %xmm10, %xmm10; + vpxor 14 * 16(%rdx), %xmm9, %xmm9; + vpxor 15 * 16(%rdx), %xmm8, %xmm8; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec; + #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ |