summaryrefslogtreecommitdiff
path: root/configure.ac
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2023-02-17 00:23:19 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2023-02-22 20:25:50 +0200
commitf4268a8f51a89a7c0374a23f669d7a19cad304ae (patch)
tree8ad1385d20b087dbc577587ea1b9e5d8c2987462 /configure.ac
parent833a904faf2b90a1b1d1b58e1e9a12f2e8e2378c (diff)
downloadlibgcrypt-f4268a8f51a89a7c0374a23f669d7a19cad304ae.tar.gz
aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations
* cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and 'aria-aesni-avx2-amd64.S'. * cipher/aria-aesni-avx-amd64.S: New. * cipher/aria-aesni-avx2-amd64.S: New. * cipher/aria.c (USE_AESNI_AVX, USE_GFNI_AVX, USE_AESNI_AVX2) (USE_GFNI_AVX2, MAX_PARALLEL_BLKS, ASM_FUNC_ABI, ASM_EXTRA_STACK): New. (ARIA_context): Add 'use_aesni_avx', 'use_gfni_avx', 'use_aesni_avx2' and 'use_gfni_avx2'. (_gcry_aria_aesni_avx_ecb_crypt_blk1_16) (_gcry_aria_aesni_avx_ctr_crypt_blk16) (_gcry_aria_gfni_avx_ecb_crypt_blk1_16) (_gcry_aria_gfni_avx_ctr_crypt_blk16) (aria_avx_ecb_crypt_blk1_16, aria_avx_ctr_crypt_blk16) (_gcry_aria_aesni_avx2_ecb_crypt_blk32) (_gcry_aria_aesni_avx2_ctr_crypt_blk32) (_gcry_aria_gfni_avx2_ecb_crypt_blk32) (_gcry_aria_gfni_avx2_ctr_crypt_blk32) (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): New. (aria_crypt_blocks) [USE_AESNI_AVX2]: Add 32 parallel block AVX2/AESNI/GFNI processing. (aria_crypt_blocks) [USE_AESNI_AVX]: Add 3 to 16 parallel block AVX/AESNI/GFNI processing. (_gcry_aria_ctr_enc) [USE_AESNI_AVX2]: Add 32 parallel block AVX2/AESNI/GFNI processing. (_gcry_aria_ctr_enc) [USE_AESNI_AVX]: Add 16 parallel block AVX/AESNI/GFNI processing. (_gcry_aria_ctr_enc, _gcry_aria_cbc_dec, _gcry_aria_cfb_enc) (_gcry_aria_ecb_crypt, _gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc) (_gcry_aria_ocb_crypt, _gcry_aria_ocb_auth): Use MAX_PARALLEL_BLKS for parallel processing width. (aria_setkey): Enable AESNI/AVX, GFNI/AVX, AESNI/AVX2, GFNI/AVX2 based on HW features. * configure.ac: Add 'aria-aesni-avx-amd64.lo' and 'aria-aesni-avx2-amd64.lo'. --- This patch adds AVX/AVX2/AESNI/GFNI accelerated ARIA block cipher implementations for libgcrypt. This implementation is based on work by Taehee Yoo, with following notable changes: - Integration to libgcrypt, use of 'aes-common-amd64.h'. - Use 'vmovddup' for loading GFNI constants. - Use round loop instead of unrolling for smaller code size and increased performance. - Use stack for temporary storage instead of external buffers. - Use merge ECB encryption/decryption to single function. - Add 1 to 15 blocks support for AVX ECB functions. - Add byte-addition fast path for CTR. === Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.715 ns/B 1333 MiB/s 3.36 c/B 4700 ECB dec | 0.712 ns/B 1339 MiB/s 3.35 c/B 4700 CTR enc | 0.714 ns/B 1336 MiB/s 3.36 c/B 4700 CTR dec | 0.714 ns/B 1335 MiB/s 3.36 c/B 4700 GFNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.516 ns/B 1847 MiB/s 2.43 c/B 4700 ECB dec | 0.519 ns/B 1839 MiB/s 2.44 c/B 4700 CTR enc | 0.517 ns/B 1846 MiB/s 2.43 c/B 4700 CTR dec | 0.518 ns/B 1843 MiB/s 2.43 c/B 4700 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.416 ns/B 2292 MiB/s 1.96 c/B 4700 ECB dec | 0.421 ns/B 2266 MiB/s 1.98 c/B 4700 CTR enc | 0.415 ns/B 2298 MiB/s 1.95 c/B 4700 CTR dec | 0.415 ns/B 2300 MiB/s 1.95 c/B 4700 GFNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.235 ns/B 4056 MiB/s 1.11 c/B 4700 ECB dec | 0.234 ns/B 4079 MiB/s 1.10 c/B 4700 CTR enc | 0.232 ns/B 4104 MiB/s 1.09 c/B 4700 CTR dec | 0.233 ns/B 4094 MiB/s 1.10 c/B 4700 === Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.26 ns/B 757.6 MiB/s 3.77 c/B 2993 ECB dec | 1.27 ns/B 753.1 MiB/s 3.79 c/B 2992 CTR enc | 1.25 ns/B 760.3 MiB/s 3.75 c/B 2992 CTR dec | 1.26 ns/B 759.1 MiB/s 3.76 c/B 2992 GFNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.967 ns/B 986.6 MiB/s 2.89 c/B 2992 ECB dec | 0.966 ns/B 987.1 MiB/s 2.89 c/B 2992 CTR enc | 0.972 ns/B 980.8 MiB/s 2.91 c/B 2993 CTR dec | 0.971 ns/B 982.5 MiB/s 2.90 c/B 2993 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.817 ns/B 1167 MiB/s 2.44 c/B 2992 ECB dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 CTR enc | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 CTR dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 GFNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.506 ns/B 1886 MiB/s 1.51 c/B 2992 ECB dec | 0.505 ns/B 1887 MiB/s 1.51 c/B 2992 CTR enc | 0.564 ns/B 1691 MiB/s 1.69 c/B 2992 CTR dec | 0.565 ns/B 1689 MiB/s 1.69 c/B 2992 === Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.921 ns/B 1035 MiB/s 3.50 c/B 3800 ECB dec | 0.922 ns/B 1034 MiB/s 3.50 c/B 3800 CTR enc | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 CTR dec | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800 ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800 CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800 CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800 === Benchmark on AMD EPYC 7642 (zen2): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.22 ns/B 784.5 MiB/s 4.01 c/B 3298 ECB dec | 1.22 ns/B 784.8 MiB/s 4.00 c/B 3292 CTR enc | 1.22 ns/B 780.1 MiB/s 4.03 c/B 3299 CTR dec | 1.22 ns/B 779.1 MiB/s 4.04 c/B 3299 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.735 ns/B 1298 MiB/s 2.42 c/B 3299 ECB dec | 0.738 ns/B 1292 MiB/s 2.44 c/B 3299 CTR enc | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299 CTR dec | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299 === Benchmark on Intel Core i5-6500 (skylake): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.24 ns/B 766.6 MiB/s 4.48 c/B 3598 ECB dec | 1.25 ns/B 764.9 MiB/s 4.49 c/B 3598 CTR enc | 1.25 ns/B 761.7 MiB/s 4.50 c/B 3598 CTR dec | 1.25 ns/B 761.6 MiB/s 4.51 c/B 3598 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3599 ECB dec | 0.831 ns/B 1147 MiB/s 2.99 c/B 3598 CTR enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3598 CTR dec | 0.828 ns/B 1152 MiB/s 2.98 c/B 3598 === Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.11 ns/B 452.7 MiB/s 5.25 c/B 2494 ECB dec | 2.10 ns/B 454.5 MiB/s 5.23 c/B 2494 CTR enc | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 CTR dec | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 [v2] - Optimization for CTR mode: Use CTR byte-addition path when counter carry-overflow happen only on ctr-variable but not in generated counter vector registers. Cc: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'configure.ac')
-rw-r--r--configure.ac8
1 files changed, 8 insertions, 0 deletions
diff --git a/configure.ac b/configure.ac
index 9163b2ed..4f983a58 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3034,6 +3034,14 @@ LIST_MEMBER(aria, $enabled_ciphers)
if test "$found" = "1" ; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS aria.lo"
AC_DEFINE(USE_ARIA, 1, [Defined if this module should be included])
+
+ case "${host}" in
+ x86_64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx-amd64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx2-amd64.lo"
+ ;;
+ esac
fi
LIST_MEMBER(dsa, $enabled_pubkey_ciphers)