summary | refs | log | tree | commit | diff
path: root/cipher/camellia-glue.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-21 20:53:39 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-02-22 21:13:36 +0200
commit: b9a9755742c7bf7ca8c007d33f98aaa076a382c7 (patch)
tree: 4bf1aa87c67ae8d75e375506a997e9c2ceef9827 /cipher/camellia-glue.c
parent: a4c22331f57d23832ddd019ac3108b5fa3dd942b (diff)
download: libgcrypt-b9a9755742c7bf7ca8c007d33f98aaa076a382c7.tar.gz
camellia-avx2: speed up for round key broadcasting
* cipher/camellia-aesni-avx2-amd64.h (roundsm32, fls32): Use
'vpbroadcastb' for loading round key.
* cipher/camellia-glue.c (camellia_encrypt_blk1_32)
(camellia_decrypt_blk1_32): Adjust num_blks thresholds for AVX2
implementations, 2 blks for GFNI, 4 blks for VAES and 5 blks for AESNI.
--

Benchmark on AMD Ryzen 9 7900X (turbo-freq off):

Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.213 ns/B      4469 MiB/s      1.00 c/B      4700
        ECB dec |     0.215 ns/B      4440 MiB/s      1.01 c/B      4700

After (~10% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.194 ns/B      4919 MiB/s     0.911 c/B      4700
        ECB dec |     0.195 ns/B      4896 MiB/s     0.916 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r-- cipher/camellia-glue.c 24
1 files changed, 12 insertions, 12 deletions
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 76a09eb1..b87faa91 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -630,27 +630,27 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
gcry_assert (num_blks <= 32);
#ifdef USE_GFNI_AVX2
- if (ctx->use_gfni_avx2 && num_blks >= 3)
+ if (ctx->use_gfni_avx2 && num_blks >= 2)
{
- /* 3 or more parallel block GFNI processing is faster than
+ /* 2 or more parallel block GFNI processing is faster than
* generic C implementation. */
_gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;
}
#endif
#ifdef USE_VAES_AVX2
- if (ctx->use_vaes_avx2 && num_blks >= 6)
+ if (ctx->use_vaes_avx2 && num_blks >= 4)
{
- /* 6 or more parallel block VAES processing is faster than
+ /* 4 or more parallel block VAES processing is faster than
* generic C implementation. */
_gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;
}
#endif
#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2 && num_blks >= 6)
+ if (ctx->use_aesni_avx2 && num_blks >= 5)
{
- /* 6 or more parallel block AESNI processing is faster than
+ /* 5 or more parallel block AESNI processing is faster than
* generic C implementation. */
_gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;
@@ -721,27 +721,27 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
gcry_assert (num_blks <= 32);
#ifdef USE_GFNI_AVX2
- if (ctx->use_gfni_avx2 && num_blks >= 3)
+ if (ctx->use_gfni_avx2 && num_blks >= 2)
{
- /* 3 or more parallel block GFNI processing is faster than
+ /* 2 or more parallel block GFNI processing is faster than
* generic C implementation. */
_gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;
}
#endif
#ifdef USE_VAES_AVX2
- if (ctx->use_vaes_avx2 && num_blks >= 6)
+ if (ctx->use_vaes_avx2 && num_blks >= 4)
{
- /* 6 or more parallel block VAES processing is faster than
+ /* 4 or more parallel block VAES processing is faster than
* generic C implementation. */
_gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;
}
#endif
#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2 && num_blks >= 6)
+ if (ctx->use_aesni_avx2 && num_blks >= 5)
{
- /* 6 or more parallel block AESNI processing is faster than
+ /* 5 or more parallel block AESNI processing is faster than
* generic C implementation. */
_gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
return avx2_burn_stack_depth;