diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2013-01-23 11:55:13 +0200 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 2013-02-19 11:21:48 +0100 |
commit | 63ac3ba07dba82fde040d31b90b4eff627bd92b9 (patch) | |
tree | c103c60a747faff8ebb8e1f7b72a9faa68ed089d /cipher/camellia-glue.c | |
parent | 4de62d80644228fc5db2a9f9c94a7eb633d8de2e (diff) | |
download | libgcrypt-63ac3ba07dba82fde040d31b90b4eff627bd92b9.tar.gz |
Add AES-NI/AVX accelerated Camellia implementation
* configure.ac: Add option --disable-avx-support.
(HAVE_GCC_INLINE_ASM_AVX): New.
(ENABLE_AVX_SUPPORT): New.
(camellia) [ENABLE_AVX_SUPPORT, ENABLE_AESNI_SUPPORT]: Add
camellia_aesni_avx_x86-64.lo.
* cipher/Makefile.am (AM_CCASFLAGS): Add.
(EXTRA_libcipher_la_SOURCES): Add camellia_aesni_avx_x86-64.S
* cipher/camellia-glue.c [ENABLE_AESNI_SUPPORT, ENABLE_AVX_SUPPORT]
[__x86_64__] (USE_AESNI_AVX): Add macro.
(struct Camellia_context) [USE_AESNI_AVX]: Add use_aesni_avx.
[USE_AESNI_AVX] (_gcry_camellia_aesni_avx_ctr_enc)
(_gcry_camellia_aesni_avx_cbc_dec): New prototypes to assembly
functions.
(camellia_setkey) [USE_AESNI_AVX]: Enable AES-NI/AVX if the hardware
supports both.
(_gcry_camellia_ctr_enc) [USE_AESNI_AVX]: Add AES-NI/AVX code.
(_gcry_camellia_cbc_dec) [USE_AESNI_AVX]: Add AES-NI/AVX code.
* cipher/camellia_aesni_avx_x86-64.S: New.
* src/g10lib.h (HWF_INTEL_AVX): New.
* src/global.c (hwflist): Add HWF_INTEL_AVX.
* src/hwf-x86.c (detect_x86_gnuc) [ENABLE_AVX_SUPPORT]: Add detection
for AVX.
--
Before:
Running each test 250 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
CAMELLIA128 2210ms 2200ms 2300ms 2050ms 2240ms 2250ms 2290ms 2270ms 2070ms 2070ms
CAMELLIA256 2810ms 2800ms 2920ms 2670ms 2840ms 2850ms 2910ms 2890ms 2660ms 2640ms
After:
Running each test 250 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
CAMELLIA128 2200ms 2220ms 2290ms 470ms 2240ms 2270ms 2270ms 2290ms 480ms 480ms
CAMELLIA256 2820ms 2820ms 2900ms 600ms 2860ms 2860ms 2900ms 2920ms 620ms 620ms
The AES-NI/AVX implementation works by processing 16 blocks (256 bytes) in
parallel. It is a byte-sliced implementation that uses AES-NI (SubBytes) for the
Camellia S-boxes, with the help of prefiltering/postfiltering. For smaller data
sets the generic C implementation is used.
Speed-up for CBC-decryption and CTR-mode (large data): 4.3x
Tests were run on: Intel Core i5-2450M
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
(license boiler plate update by wk)
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r-- | cipher/camellia-glue.c | 100 |
1 files changed, 98 insertions, 2 deletions
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index ba8aa281..dd9206f1 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -71,12 +71,38 @@ # define ATTR_ALIGNED_16 #endif +/* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */ +#undef USE_AESNI_AVX +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) +# if defined(__x86_64__) +# define USE_AESNI_AVX 1 +# endif +#endif + typedef struct { int keybitlength; KEY_TABLE_TYPE keytable; +#ifdef USE_AESNI_AVX + int use_aesni_avx; /* AES-NI/AVX implementation shall be used. */ +#endif /*USE_AESNI_AVX*/ } CAMELLIA_context; +#ifdef USE_AESNI_AVX +/* Assembler implementations of Camellia using AES-NI and AVX. Process data + in 16 block same time. + */ +extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + static const char *selftest(void); static gcry_err_code_t @@ -109,6 +135,15 @@ camellia_setkey(void *c, const byte *key, unsigned keylen) +3*2*sizeof(void*) /* Function calls. */ ); +#ifdef USE_AESNI_AVX + ctx->use_aesni_avx = 0; + if ((_gcry_get_hw_features () & HWF_INTEL_AESNI) && + (_gcry_get_hw_features () & HWF_INTEL_AVX)) + { + ctx->use_aesni_avx = 1; + } +#endif + return 0; } @@ -158,8 +193,39 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE]; + int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size; int i; +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx) + { + int did_use_aesni_avx = 0; + + /* Process data in 16 block chunks. 
*/ + while (nblocks >= 16) + { + _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 16; + outbuf += 16 * CAMELLIA_BLOCK_SIZE; + inbuf += 16 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx = 1; + } + + if (did_use_aesni_avx) + { + /* clear AVX registers */ + asm volatile ("vzeroall;\n":::); + + if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) + burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ @@ -178,7 +244,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, } wipememory(tmpbuf, sizeof(tmpbuf)); - _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size); + _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CBC mode. This function is only @@ -192,6 +258,36 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char savebuf[CAMELLIA_BLOCK_SIZE]; + int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size; + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx) + { + int did_use_aesni_avx = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * CAMELLIA_BLOCK_SIZE; + inbuf += 16 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx = 1; + } + + if (did_use_aesni_avx) + { + /* clear AVX registers */ + asm volatile ("vzeroall;\n":::); + + if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) + burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + } + + /* Use generic code to handle smaller chunks... 
*/ + } +#endif for ( ;nblocks; nblocks-- ) { @@ -208,7 +304,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, } wipememory(savebuf, sizeof(savebuf)); - _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size); + _gcry_burn_stack(burn_stack_depth); } /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR |