diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-21 20:53:39 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 21:13:36 +0200 |
commit | b9a9755742c7bf7ca8c007d33f98aaa076a382c7 (patch) | |
tree | 4bf1aa87c67ae8d75e375506a997e9c2ceef9827 /cipher/camellia-glue.c | |
parent | a4c22331f57d23832ddd019ac3108b5fa3dd942b (diff) | |
download | libgcrypt-b9a9755742c7bf7ca8c007d33f98aaa076a382c7.tar.gz |
camellia-avx2: speed up for round key broadcasting
* cipher/camellia-aesni-avx2-amd64.h (roundsm32, fls32): Use
'vpbroadcastb' for loading round key.
* cipher/camellia-glue.c (camellia_encrypt_blk1_32)
(camellia_decrypt_blk1_32): Adjust num_blks thresholds for AVX2
implementations, 2 blks for GFNI, 4 blks for VAES and 5 blks for AESNI.
--
Benchmark on AMD Ryzen 9 7900X (turbo-freq off):
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.213 ns/B 4469 MiB/s 1.00 c/B 4700
ECB dec | 0.215 ns/B 4440 MiB/s 1.01 c/B 4700
After (~10% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.194 ns/B 4919 MiB/s 0.911 c/B 4700
ECB dec | 0.195 ns/B 4896 MiB/s 0.916 c/B 4700
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r-- | cipher/camellia-glue.c | 24 |
1 files changed, 12 insertions, 12 deletions
```diff
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 76a09eb1..b87faa91 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -630,27 +630,27 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
   gcry_assert (num_blks <= 32);
 
 #ifdef USE_GFNI_AVX2
-  if (ctx->use_gfni_avx2 && num_blks >= 3)
+  if (ctx->use_gfni_avx2 && num_blks >= 2)
     {
-      /* 3 or more parallel block GFNI processing is faster than
+      /* 2 or more parallel block GFNI processing is faster than
        * generic C implementation. */
       _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
     }
 #endif
 #ifdef USE_VAES_AVX2
-  if (ctx->use_vaes_avx2 && num_blks >= 6)
+  if (ctx->use_vaes_avx2 && num_blks >= 4)
     {
-      /* 6 or more parallel block VAES processing is faster than
+      /* 4 or more parallel block VAES processing is faster than
        * generic C implementation. */
       _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
     }
 #endif
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2 && num_blks >= 6)
+  if (ctx->use_aesni_avx2 && num_blks >= 5)
     {
-      /* 6 or more parallel block AESNI processing is faster than
+      /* 5 or more parallel block AESNI processing is faster than
        * generic C implementation. */
       _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
@@ -721,27 +721,27 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
   gcry_assert (num_blks <= 32);
 
 #ifdef USE_GFNI_AVX2
-  if (ctx->use_gfni_avx2 && num_blks >= 3)
+  if (ctx->use_gfni_avx2 && num_blks >= 2)
     {
-      /* 3 or more parallel block GFNI processing is faster than
+      /* 2 or more parallel block GFNI processing is faster than
        * generic C implementation. */
       _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
     }
 #endif
 #ifdef USE_VAES_AVX2
-  if (ctx->use_vaes_avx2 && num_blks >= 6)
+  if (ctx->use_vaes_avx2 && num_blks >= 4)
     {
-      /* 6 or more parallel block VAES processing is faster than
+      /* 4 or more parallel block VAES processing is faster than
        * generic C implementation. */
       _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
     }
 #endif
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2 && num_blks >= 6)
+  if (ctx->use_aesni_avx2 && num_blks >= 5)
     {
-      /* 6 or more parallel block AESNI processing is faster than
+      /* 5 or more parallel block AESNI processing is faster than
        * generic C implementation. */
       _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
       return avx2_burn_stack_depth;
```