summaryrefslogtreecommitdiff
path: root/cipher/keccak.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-07-21 11:14:07 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-07-25 16:11:09 +0300
commit: beaad75f4655e5316ce24f75ef172c231fd47fc1 (patch)
tree: 9d61130f5670af0999601055c9a436c09142a0a5 /cipher/keccak.c
parent: dca0bd133dd08ec88e0b4c454cfc26c9093572a9 (diff)
download: libgcrypt-beaad75f4655e5316ce24f75ef172c231fd47fc1.tar.gz
sha3: Add x86-64 AVX512 accelerated implementation
* LICENSES: Add 'cipher/keccak-amd64-avx512.S'.
* configure.ac: Add 'keccak-amd64-avx512.lo'.
* cipher/Makefile.am: Add 'keccak-amd64-avx512.S'.
* cipher/keccak-amd64-avx512.S: New.
* cipher/keccak.c (USE_64BIT_AVX512, ASM_FUNC_ABI): New.
[USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512)
(_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512)
(keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New.
(keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation
if supported by HW features.
--

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before (BMI2 instructions):
         | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.77 ns/B 540.3 MiB/s 7.22 c/B 4088
SHA3-256 | 1.86 ns/B 514.0 MiB/s 7.59 c/B 4089
SHA3-384 | 2.43 ns/B 393.1 MiB/s 9.92 c/B 4089
SHA3-512 | 3.49 ns/B 273.2 MiB/s 14.27 c/B 4088
SHAKE128 | 1.52 ns/B 629.1 MiB/s 6.20 c/B 4089
SHAKE256 | 1.86 ns/B 511.6 MiB/s 7.62 c/B 4089

After (~33% faster):
         | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.32 ns/B 721.8 MiB/s 5.40 c/B 4089
SHA3-256 | 1.40 ns/B 681.7 MiB/s 5.72 c/B 4089
SHA3-384 | 1.83 ns/B 522.5 MiB/s 7.46 c/B 4089
SHA3-512 | 2.63 ns/B 362.1 MiB/s 10.77 c/B 4088
SHAKE128 | 1.13 ns/B 840.4 MiB/s 4.64 c/B 4089
SHAKE256 | 1.40 ns/B 682.1 MiB/s 5.72 c/B 4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak.c')
-rw-r--r-- cipher/keccak.c 83
1 files changed, 83 insertions, 0 deletions
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f3502022..e7e42473 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -62,6 +62,16 @@
#endif
+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code.
+ * It is defined only when all of the following hold: USE_64BIT is defined,
+ * the target is x86-64 (__x86_64__), HAVE_GCC_INLINE_ASM_AVX512 is defined,
+ * and either HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS or
+ * HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS is defined.  The #undef first
+ * clears any definition supplied from outside this file. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
* code. */
#undef USE_64BIT_ARM_NEON
@@ -81,6 +91,16 @@
#endif /* USE_S390X_CRYPTO */
+/* x86-64 vector register assembly implementations use the SystemV ABI;
+ * on Win64 an ABI conversion is needed, which is requested through a
+ * function attribute.  ASM_FUNC_ABI expands to that attribute only when
+ * USE_64BIT_AVX512 and HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS are both
+ * defined, and to nothing otherwise. */
+#undef ASM_FUNC_ABI
+#if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
# define NEED_COMMON64 1
#endif
@@ -428,6 +448,65 @@ static const keccak_ops_t keccak_bmi2_64_ops =
#endif /* USE_64BIT_BMI2 */
+/* 64-bit Intel AVX512 implementation.
+ *
+ * The two external routines declared below each take the 64-bit lane
+ * state and a round-constant table; the C wrappers that follow pass
+ * hd->u.state64 and _gcry_keccak_round_consts_64bit for those arguments.
+ * Both are declared with ASM_FUNC_ABI so that the calling convention is
+ * converted on builds where that macro is non-empty.
+ * NOTE(review): the routines are presumably defined in
+ * cipher/keccak-amd64-avx512.S, which is not visible from here; the
+ * meaning of their return values should be confirmed against it. */
+#ifdef USE_64BIT_AVX512
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+ const byte *lanes, size_t nlanes,
+ size_t blocklanes, const byte **new_lanes);
+/* Permute the Keccak state of HD using the AVX512 routine.
+ * Passes hd->u.state64 and _gcry_keccak_round_consts_64bit to
+ * _gcry_keccak_f1600_state_permute64_avx512 and returns that routine's
+ * result unchanged.
+ * NOTE(review): what the returned value represents is not visible here;
+ * confirm against the assembly source and the other permute callbacks. */
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+ return _gcry_keccak_f1600_state_permute64_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+/* Absorb NLANES 8-byte lanes from LANES into the 64-bit state of HD.
+ *
+ * hd:         context whose u.state64 lanes are updated.
+ * pos:        index of the first state lane to XOR into; it is a local
+ *             copy, so the final position is not reported to the caller.
+ * lanes:      input bytes; 8 bytes are consumed per lane.
+ * nlanes:     number of lanes to absorb.
+ * blocklanes: number of lanes after which the state is permuted.
+ *
+ * Always returns 0; the value returned by the permute wrapper is not
+ * propagated. */
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ while (nlanes)
+ {
+ if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes)
+ { /* Bulk path: at a block start with at least one whole block of input.
+ * The call's result becomes the new NLANES and &lanes is passed as
+ * the new_lanes output argument.
+ * NOTE(review): the routine presumably absorbs whole blocks, stores
+ * the advanced input pointer through new_lanes and returns the number
+ * of lanes left over -- confirm against the assembly source. */
+ nlanes = _gcry_keccak_absorb_blocks_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit,
+ lanes, nlanes, blocklanes, &lanes);
+ }
+ /* Lane-at-a-time path for whatever the bulk path did not take. */
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64 (lanes);
+ lanes += 8;
+ nlanes--;
+ /* A full block has been XORed in: permute, restart at lane 0 and
+ * leave the inner loop so the outer loop can try the bulk path again. */
+ if (++pos == blocklanes)
+ {
+ keccak_f1600_state_permute64_avx512 (hd);
+ pos = 0;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+/* Operations table for the AVX512 implementation: permute and absorb use
+ * the AVX512 wrappers defined in this section, while extract uses
+ * keccak_extract64, which is defined outside this section. */
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+ .permute = keccak_f1600_state_permute64_avx512,
+ .absorb = keccak_absorb_lanes64_avx512,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
/* 64-bit ARMv7/NEON implementation. */
#ifdef USE_64BIT_ARM_NEON
@@ -894,6 +973,10 @@ keccak_init (int algo, void *context, unsigned int flags)
/* Select optimized implementation based in hw features. */
if (0) {}
+#ifdef USE_64BIT_AVX512
+ else if (features & HWF_INTEL_AVX512)
+ ctx->ops = &keccak_avx512_64_ops;
+#endif
#ifdef USE_64BIT_ARM_NEON
else if (features & HWF_ARM_NEON)
ctx->ops = &keccak_armv7_neon_64_ops;