summaryrefslogtreecommitdiff
path: root/configure.ac
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2023-02-26 21:15:36 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2023-02-28 16:14:35 +0200
commit898c857206ada06d70c5f46ac5adaa9d7058e672 (patch)
tree7248f49dee5fabc13354230fba2ac4ffdd1ad5a8 /configure.ac
parent6fa11d8b7070eb7c4c296c879213c9596bd00b1c (diff)
downloadlibgcrypt-898c857206ada06d70c5f46ac5adaa9d7058e672.tar.gz
camellia: add AArch64 crypto-extension implementation
* cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'. (aarch64_neon_cflags): New. * cipher/camellia-aarch64-ce.c: New. * cipher/camellia-glue.c (USE_AARCH64_CE): New. (CAMELLIA_context): Add 'use_aarch64ce'. (_gcry_camellia_aarch64ce_encrypt_blk16) (_gcry_camellia_aarch64ce_decrypt_blk16) (_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16) (camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New. (camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has HWF_ARM_AES; Use AArch64/CE key generation if supported by HW. (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32) [USE_AARCH64_CE]: Add AArch64/CE code path. -- Patch enables 128-bit vector instrinsics implementation of Camellia cipher for AArch64. Benchmark on AWS Graviton2: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 5.99 ns/B 159.2 MiB/s 14.97 c/B 2500 ECB dec | 5.99 ns/B 159.1 MiB/s 14.98 c/B 2500 CBC enc | 6.16 ns/B 154.7 MiB/s 15.41 c/B 2500 CBC dec | 6.12 ns/B 155.8 MiB/s 15.29 c/B 2499 CFB enc | 6.49 ns/B 147.0 MiB/s 16.21 c/B 2500 CFB dec | 6.05 ns/B 157.6 MiB/s 15.13 c/B 2500 CTR enc | 6.09 ns/B 156.7 MiB/s 15.22 c/B 2500 CTR dec | 6.09 ns/B 156.6 MiB/s 15.22 c/B 2500 XTS enc | 6.16 ns/B 154.9 MiB/s 15.39 c/B 2500 XTS dec | 6.16 ns/B 154.8 MiB/s 15.40 c/B 2499 GCM enc | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500 GCM dec | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500 GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500 OCB enc | 6.63 ns/B 143.9 MiB/s 16.57 c/B 2499 OCB dec | 6.63 ns/B 143.9 MiB/s 16.56 c/B 2499 OCB auth | 6.55 ns/B 145.7 MiB/s 16.37 c/B 2499 After (ecb ~2.1x faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.77 ns/B 344.2 MiB/s 6.93 c/B 2499 ECB dec | 2.76 ns/B 345.3 MiB/s 6.90 c/B 2499 CBC enc | 6.17 ns/B 154.7 MiB/s 15.41 c/B 2499 CBC dec | 2.89 ns/B 330.3 MiB/s 7.22 c/B 2500 CFB enc | 6.48 ns/B 147.1 MiB/s 16.21 c/B 2499 CFB dec | 2.84 ns/B 336.1 MiB/s 7.09 c/B 2499 CTR enc | 2.90 ns/B 328.8 MiB/s 7.25 c/B 2499 CTR dec | 2.90 ns/B 328.9 MiB/s 7.25 c/B 2500 XTS enc | 2.93 ns/B 325.3 MiB/s 7.33 c/B 2500 XTS dec | 2.92 ns/B 326.2 MiB/s 7.31 c/B 2500 GCM enc | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2500 GCM dec | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2499 GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500 Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'configure.ac')
-rw-r--r--configure.ac106
1 files changed, 102 insertions, 4 deletions
diff --git a/configure.ac b/configure.ac
index a40a8135..0d5c9160 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2136,7 +2136,103 @@ fi
#
-# Check whether PowerPC AltiVec/VSX intrinsics
+# Check whether compiler supports AArch64/NEON/crypto intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics],
+ [gcry_cv_cc_aarch64_neon_intrinsics],
+ [if test "$mpi_cpu_arch" != "aarch64" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_cc_aarch64_neon_intrinsics="n/a"
+ else
+ gcry_cv_cc_aarch64_neon_intrinsics=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#include <arm_neon.h>
+ #define __m128i uint64x2_t
+ #define vpsrldq128(s, a, o) \
+ ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)a, \
+ (uint8x16_t)__tmp, (s) & 15); })
+ #define vaesenclast128(a, b, o) \
+ (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+ #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ vpsrldq128(2, a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ vaesenclast128(in, x, in);
+ memory_barrier_with_vec(in);
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_aarch64_neon_intrinsics=yes])
+ fi])
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto"
+
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" &&
+ test "$mpi_cpu_arch" = "aarch64" &&
+ test "$try_asm_modules" = "yes" ; then
+ AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#include <arm_neon.h>
+ #define __m128i uint64x2_t
+ #define vpsrldq128(s, a, o) \
+ ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)a, \
+ (uint8x16_t)__tmp, (s) & 15); })
+ #define vaesenclast128(a, b, o) \
+ (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+ #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ vpsrldq128(2, a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ vaesenclast128(in, x, in);
+ memory_barrier_with_vec(in);
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])])
+ if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags])
+ fi
+fi
+
+AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS,
+ test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
+#
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
#
AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics],
[gcry_cv_cc_ppc_altivec],
@@ -2173,8 +2269,8 @@ _gcc_cflags_save=$CFLAGS
CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"
if test "$gcry_cv_cc_ppc_altivec" = "no" &&
- test "$mpi_cpu_arch" = "ppc" &&
- test "$try_asm_modules" == "yes" ; then
+ test "$mpi_cpu_arch" = "ppc" &&
+ test "$try_asm_modules" = "yes" ; then
AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
[gcry_cv_cc_ppc_altivec_cflags],
[gcry_cv_cc_ppc_altivec_cflags=no
@@ -2193,7 +2289,8 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" &&
vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
y = vec_sld_u32 (y, y, 3);
return vec_cipher_be (t, in) ^ (block)y;
- }]])],
+ }
+ ]])],
[gcry_cv_cc_ppc_altivec_cflags=yes])])
if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
@@ -2966,6 +3063,7 @@ if test "$found" = "1" ; then
aarch64-*-*)
# Build with the assembly implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo"
;;
powerpc64le-*-*)
# Build with the POWER vector implementations