diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-01-11 00:56:47 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2021-02-28 12:34:06 +0200 |
commit | 0e7e60241a0d054eae7a98116636a831ec6ccc97 (patch) | |
tree | 07d16dec8501dc730efd4b816fe0ad7b29996d87 /cipher/camellia-glue.c | |
parent | eb404d8904532f6dca82421c952be286a1f4e11c (diff) | |
download | libgcrypt-0e7e60241a0d054eae7a98116636a831ec6ccc97.tar.gz |
camellia: add x86_64 VAES/AVX2 accelerated implementation
* cipher/Makefile.am: Add 'camellia-aesni-avx2-amd64.h' and
'camellia-vaes-avx2-amd64.S'.
* cipher/camellia-aesni-avx2-amd64.S: New, old content moved to...
* cipher/camellia-aesni-avx2-amd64.h: ...here.
(IF_AESNI, IF_VAES, FUNC_NAME): New.
* cipher/camellia-vaes-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_VAES_AVX2): New.
(CAMELLIA_context): New member 'use_vaes_avx2'.
(_gcry_camellia_vaes_avx2_ctr_enc, _gcry_camellia_vaes_avx2_cbc_dec)
(_gcry_camellia_vaes_avx2_cfb_dec, _gcry_camellia_vaes_avx2_ocb_enc)
(_gcry_camellia_vaes_avx2_ocb_dec)
(_gcry_camellia_vaes_avx2_ocb_auth): New.
(camellia_setkey): Check for HWF_INTEL_VAES.
(_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
(_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Add USE_VAES_AVX2 code.
* configure.ac: Add 'camellia-vaes-avx2-amd64.lo'.
--
The Camellia AES-NI/AVX2 implementation had to split each 256-bit vector
into 128-bit parts for AES processing; with VAES, the 256-bit registers
can now be used directly.
Benchmarks on AMD Ryzen 5800X:
Before (AES-NI/AVX2):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.539 ns/B 1769 MiB/s 2.62 c/B 4852
CFB dec | 0.528 ns/B 1806 MiB/s 2.56 c/B 4852±1
CTR enc | 0.552 ns/B 1728 MiB/s 2.68 c/B 4850
OCB enc | 0.550 ns/B 1734 MiB/s 2.65 c/B 4825
OCB dec | 0.577 ns/B 1653 MiB/s 2.78 c/B 4825
OCB auth | 0.546 ns/B 1747 MiB/s 2.63 c/B 4825
After (VAES/AVX2, CBC-dec ~13%, CFB-dec/CTR/OCB ~20% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.477 ns/B 1999 MiB/s 2.31 c/B 4850
CFB dec | 0.433 ns/B 2201 MiB/s 2.10 c/B 4850
CTR enc | 0.438 ns/B 2176 MiB/s 2.13 c/B 4851
OCB enc | 0.449 ns/B 2122 MiB/s 2.18 c/B 4850
OCB dec | 0.468 ns/B 2038 MiB/s 2.27 c/B 4850
OCB auth | 0.447 ns/B 2131 MiB/s 2.17 c/B 4850
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r-- | cipher/camellia-glue.c | 114 |
1 files changed, 106 insertions, 8 deletions
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 6577b651..23cbec81 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -91,6 +91,12 @@ # endif #endif +/* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ +#undef USE_VAES_AVX2 +#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) +# define USE_VAES_AVX2 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -100,6 +106,7 @@ typedef struct #endif /*USE_AESNI_AVX*/ #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ + unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; @@ -201,6 +208,46 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx, const u64 Ls[32]) ASM_FUNC_ABI; #endif +#ifdef USE_VAES_AVX2 +/* Assembler implementations of Camellia using VAES and AVX2. Process data + in 32 block same time. + */ +extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char 
*checksum, + const u64 Ls[32]) ASM_FUNC_ABI; +#endif + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -225,7 +272,7 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2) unsigned int hwf = _gcry_get_hw_features (); #endif @@ -248,6 +295,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, #endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); + ctx->use_vaes_avx2 = 0; +#endif +#ifdef USE_VAES_AVX2 + ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); #endif ctx->keybitlength=keylen*8; @@ -389,11 +440,19 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); + else +#endif + _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -478,11 +537,19 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. 
*/ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv); + else +#endif + _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -564,11 +631,19 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { - _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv); + else +#endif + _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -654,6 +729,10 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2; + int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2; +#endif u64 Ls[32]; unsigned int n = 32 - (blkn % 32); u64 *l; @@ -685,7 +764,16 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); - if (encrypt) + if (0) {} +#ifdef USE_VAES_AVX2 + else if (encrypt_use_vaes) + _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else if (decrypt_use_vaes) + _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); +#endif + else if (encrypt) _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); else @@ -803,6 +891,9 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; +#ifdef USE_VAES_AVX2 + int use_vaes = ctx->use_vaes_avx2; +#endif u64 Ls[32]; unsigned int n = 32 - 
(blkn % 32); u64 *l; @@ -834,9 +925,16 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); - _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, - c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); +#ifdef USE_VAES_AVX2 + if (use_vaes) + _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); + else +#endif + _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); nblocks -= 32; abuf += 32 * CAMELLIA_BLOCK_SIZE; |