author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2021-01-11 00:56:47 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2021-02-28 12:34:06 +0200
commit    0e7e60241a0d054eae7a98116636a831ec6ccc97 (patch)
tree      07d16dec8501dc730efd4b816fe0ad7b29996d87 /cipher/camellia-glue.c
parent    eb404d8904532f6dca82421c952be286a1f4e11c (diff)
download  libgcrypt-0e7e60241a0d054eae7a98116636a831ec6ccc97.tar.gz
camellia: add x86_64 VAES/AVX2 accelerated implementation
* cipher/Makefile.am: Add 'camellia-aesni-avx2-amd64.h' and
'camellia-vaes-avx2-amd64.S'.
* cipher/camellia-aesni-avx2-amd64.S: New, old content moved to...
* cipher/camellia-aesni-avx2-amd64.h: ...here.
(IF_AESNI, IF_VAES, FUNC_NAME): New.
* cipher/camellia-vaes-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_VAES_AVX2): New.
(CAMELLIA_context): New member 'use_vaes_avx2'.
(_gcry_camellia_vaes_avx2_ctr_enc, _gcry_camellia_vaes_avx2_cbc_dec)
(_gcry_camellia_vaes_avx2_cfb_dec, _gcry_camellia_vaes_avx2_ocb_enc)
(_gcry_camellia_vaes_avx2_ocb_dec)
(_gcry_camellia_vaes_avx2_ocb_auth): New.
(camellia_setkey): Check for HWF_INTEL_VAES.
(_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
(_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Add USE_VAES_AVX2 code.
* configure.ac: Add 'camellia-vaes-avx2-amd64.lo'.
--

The Camellia AES-NI/AVX2 implementation had to split 256-bit vectors
into 128-bit parts for AES processing, but now we can use those
256-bit registers directly with VAES.

Benchmarks on AMD Ryzen 5800X:

Before (AES-NI/AVX2):
 CAMELLIA128   |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       CBC dec |     0.539 ns/B      1769 MiB/s      2.62 c/B      4852
       CFB dec |     0.528 ns/B      1806 MiB/s      2.56 c/B    4852±1
       CTR enc |     0.552 ns/B      1728 MiB/s      2.68 c/B      4850
       OCB enc |     0.550 ns/B      1734 MiB/s      2.65 c/B      4825
       OCB dec |     0.577 ns/B      1653 MiB/s      2.78 c/B      4825
      OCB auth |     0.546 ns/B      1747 MiB/s      2.63 c/B      4825

After (VAES/AVX2, CBC-dec ~13%, CFB-dec/CTR/OCB ~20% faster):
 CAMELLIA128   |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       CBC dec |     0.477 ns/B      1999 MiB/s      2.31 c/B      4850
       CFB dec |     0.433 ns/B      2201 MiB/s      2.10 c/B      4850
       CTR enc |     0.438 ns/B      2176 MiB/s      2.13 c/B      4851
       OCB enc |     0.449 ns/B      2122 MiB/s      2.18 c/B      4850
       OCB dec |     0.468 ns/B      2038 MiB/s      2.27 c/B      4850
      OCB auth |     0.447 ns/B      2131 MiB/s      2.17 c/B      4850

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
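The speed-up comes from the AES step inside the Camellia s-box computation:
the plain AES-NI AESENCLAST instruction only accepts 128-bit operands, so the
AVX2 code must split each 256-bit register into two xmm halves around it,
while VAES provides the same instruction on full ymm registers. Below is an
illustrative sketch of the two code shapes only; the register choices are
placeholders and this is not the literal IF_AESNI/IF_VAES macro bodies from
camellia-aesni-avx2-amd64.h:

    /* Illustrative sketch only -- placeholder registers.
       AES-NI path: AESENCLAST is 128-bit only, so a 256-bit state in
       %ymm0 is split, processed as two xmm halves, and merged back: */
        vextracti128 $1, %ymm0, %xmm1;         /* high 128 bits -> %xmm1      */
        vaesenclast  %xmm15, %xmm0, %xmm0;     /* AES last round on low half  */
        vaesenclast  %xmm15, %xmm1, %xmm1;     /* AES last round on high half */
        vinserti128  $1, %xmm1, %ymm0, %ymm0;  /* merge halves back into ymm0 */

    /* VAES path: the same operation on the whole 256-bit register,
       so the extract/insert pair disappears: */
        vaesenclast  %ymm15, %ymm0, %ymm0;

This matches the changelog above: the shared body now lives in
camellia-aesni-avx2-amd64.h, and the two .S files select between the paths
via the new IF_AESNI/IF_VAES macros.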
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r--   cipher/camellia-glue.c   114
1 file changed, 106 insertions(+), 8 deletions(-)
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 6577b651..23cbec81 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -91,6 +91,12 @@
# endif
#endif
+/* USE_VAES_AVX2 indicates whether to compile with Intel VAES/AVX2 code. */
+#undef USE_VAES_AVX2
+#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define USE_VAES_AVX2 1
+#endif
+
typedef struct
{
KEY_TABLE_TYPE keytable;
@@ -100,6 +106,7 @@ typedef struct
#endif /*USE_AESNI_AVX*/
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */
+ unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */
#endif /*USE_AESNI_AVX2*/
} CAMELLIA_context;
@@ -201,6 +208,46 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
const u64 Ls[32]) ASM_FUNC_ABI;
#endif
+#ifdef USE_VAES_AVX2
+/* Assembler implementations of Camellia using VAES and AVX2.  Process data
+   in 32 blocks at a time.
+ */
+extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+#endif
+
static const char *selftest(void);
static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -225,7 +272,7 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
CAMELLIA_context *ctx=c;
static int initialized=0;
static const char *selftest_failed=NULL;
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2)
unsigned int hwf = _gcry_get_hw_features ();
#endif
@@ -248,6 +295,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
#endif
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+ ctx->use_vaes_avx2 = 0;
+#endif
+#ifdef USE_VAES_AVX2
+ ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
#endif
ctx->keybitlength=keylen*8;
@@ -389,11 +440,19 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+#ifdef USE_VAES_AVX2
+ int use_vaes = ctx->use_vaes_avx2;
+#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
- _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+#ifdef USE_VAES_AVX2
+ if (use_vaes)
+ _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+ else
+#endif
+ _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -478,11 +537,19 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+#ifdef USE_VAES_AVX2
+ int use_vaes = ctx->use_vaes_avx2;
+#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
- _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+#ifdef USE_VAES_AVX2
+ if (use_vaes)
+ _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+ else
+#endif
+ _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -564,11 +631,19 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+#ifdef USE_VAES_AVX2
+ int use_vaes = ctx->use_vaes_avx2;
+#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
- _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+#ifdef USE_VAES_AVX2
+ if (use_vaes)
+ _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+ else
+#endif
+ _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -654,6 +729,10 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+#ifdef USE_VAES_AVX2
+ int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2;
+ int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2;
+#endif
u64 Ls[32];
unsigned int n = 32 - (blkn % 32);
u64 *l;
@@ -685,7 +764,16 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
- if (encrypt)
+ if (0) {}
+#ifdef USE_VAES_AVX2
+ else if (encrypt_use_vaes)
+ _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else if (decrypt_use_vaes)
+ _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+#endif
+ else if (encrypt)
_gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
c->u_ctr.ctr, Ls);
else
@@ -803,6 +891,9 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+#ifdef USE_VAES_AVX2
+ int use_vaes = ctx->use_vaes_avx2;
+#endif
u64 Ls[32];
unsigned int n = 32 - (blkn % 32);
u64 *l;
@@ -834,9 +925,16 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
- _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+#ifdef USE_VAES_AVX2
+ if (use_vaes)
+ _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+ else
+#endif
+ _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
nblocks -= 32;
abuf += 32 * CAMELLIA_BLOCK_SIZE;