diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-07-21 11:14:07 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-07-25 16:11:09 +0300 |
commit | beaad75f4655e5316ce24f75ef172c231fd47fc1 (patch) | |
tree | 9d61130f5670af0999601055c9a436c09142a0a5 /cipher/keccak.c | |
parent | dca0bd133dd08ec88e0b4c454cfc26c9093572a9 (diff) | |
download | libgcrypt-beaad75f4655e5316ce24f75ef172c231fd47fc1.tar.gz |
sha3: Add x86-64 AVX512 accelerated implementation
* LICENSES: Add 'cipher/keccak-amd64-avx512.S'.
* configure.ac: Add 'keccak-amd64-avx512.lo'.
* cipher/Makefile.am: Add 'keccak-amd64-avx512.S'.
* cipher/keccak-amd64-avx512.S: New.
* cipher/keccak.c (USE_64BIT_AVX512, ASM_FUNC_ABI): New.
[USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512)
(_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512)
(keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New.
(keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation
if supported by HW features.
--
Benchmark on Intel Core i3-1115G4 (tigerlake):
Before (BMI2 instructions):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.77 ns/B 540.3 MiB/s 7.22 c/B 4088
SHA3-256 | 1.86 ns/B 514.0 MiB/s 7.59 c/B 4089
SHA3-384 | 2.43 ns/B 393.1 MiB/s 9.92 c/B 4089
SHA3-512 | 3.49 ns/B 273.2 MiB/s 14.27 c/B 4088
SHAKE128 | 1.52 ns/B 629.1 MiB/s 6.20 c/B 4089
SHAKE256 | 1.86 ns/B 511.6 MiB/s 7.62 c/B 4089
After (~33% faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.32 ns/B 721.8 MiB/s 5.40 c/B 4089
SHA3-256 | 1.40 ns/B 681.7 MiB/s 5.72 c/B 4089
SHA3-384 | 1.83 ns/B 522.5 MiB/s 7.46 c/B 4089
SHA3-512 | 2.63 ns/B 362.1 MiB/s 10.77 c/B 4088
SHAKE128 | 1.13 ns/B 840.4 MiB/s 4.64 c/B 4089
SHAKE256 | 1.40 ns/B 682.1 MiB/s 5.72 c/B 4089
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak.c')
-rw-r--r-- | cipher/keccak.c | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f3502022..e7e42473 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -62,6 +62,16 @@
 #endif
 
 
+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
 /* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
  * code. */
 #undef USE_64BIT_ARM_NEON
@@ -81,6 +91,16 @@
 #endif /* USE_S390X_CRYPTO */
 
 
+/* x86-64 vector register assembly implementations use SystemV ABI, ABI
+ * conversion needed on Win64 through function attribute. */
+#undef ASM_FUNC_ABI
+#if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
 #if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
@@ -428,6 +448,65 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit Intel AVX512 implementation. */
+#ifdef USE_64BIT_AVX512
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+                                  const byte *lanes, size_t nlanes,
+                                  size_t blocklanes, const byte **new_lanes);
+
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_f1600_state_permute64_avx512 (
+                                hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+                             unsigned int nlanes, int blocklanes)
+{
+  while (nlanes)
+    {
+      if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes)
+        {
+          nlanes = _gcry_keccak_absorb_blocks_avx512 (
+                            hd->u.state64, _gcry_keccak_round_consts_64bit,
+                            lanes, nlanes, blocklanes, &lanes);
+        }
+
+      while (nlanes)
+        {
+          hd->u.state64[pos] ^= buf_get_le64 (lanes);
+          lanes += 8;
+          nlanes--;
+
+          if (++pos == blocklanes)
+            {
+              keccak_f1600_state_permute64_avx512 (hd);
+              pos = 0;
+              break;
+            }
+        }
+    }
+
+  return 0;
+}
+
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+  .permute = keccak_f1600_state_permute64_avx512,
+  .absorb = keccak_absorb_lanes64_avx512,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
 /* 64-bit ARMv7/NEON implementation. */
 #ifdef USE_64BIT_ARM_NEON
 
@@ -894,6 +973,10 @@ keccak_init (int algo, void *context, unsigned int flags)
   /* Select optimized implementation based in hw features. */
   if (0)
     {}
+#ifdef USE_64BIT_AVX512
+  else if (features & HWF_INTEL_AVX512)
+    ctx->ops = &keccak_avx512_64_ops;
+#endif
 #ifdef USE_64BIT_ARM_NEON
   else if (features & HWF_ARM_NEON)
     ctx->ops = &keccak_armv7_neon_64_ops;
|