diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-26 21:15:36 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-28 16:14:35 +0200 |
commit | 898c857206ada06d70c5f46ac5adaa9d7058e672 (patch) | |
tree | 7248f49dee5fabc13354230fba2ac4ffdd1ad5a8 | |
parent | 6fa11d8b7070eb7c4c296c879213c9596bd00b1c (diff) | |
download | libgcrypt-898c857206ada06d70c5f46ac5adaa9d7058e672.tar.gz |
camellia: add AArch64 crypto-extension implementation
* cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'.
(aarch64_neon_cflags): New.
* cipher/camellia-aarch64-ce.c: New.
* cipher/camellia-glue.c (USE_AARCH64_CE): New.
(CAMELLIA_context): Add 'use_aarch64ce'.
(_gcry_camellia_aarch64ce_encrypt_blk16)
(_gcry_camellia_aarch64ce_decrypt_blk16)
(_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16)
(camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New.
(camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has
HWF_ARM_AES; Use AArch64/CE key generation if supported by HW.
(camellia_encrypt_blk1_32, camellia_decrypt_blk1_32)
[USE_AARCH64_CE]: Add AArch64/CE code path.
--
Patch enables 128-bit vector intrinsics implementation of Camellia
cipher for AArch64.
Benchmark on AWS Graviton2:
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 5.99 ns/B 159.2 MiB/s 14.97 c/B 2500
ECB dec | 5.99 ns/B 159.1 MiB/s 14.98 c/B 2500
CBC enc | 6.16 ns/B 154.7 MiB/s 15.41 c/B 2500
CBC dec | 6.12 ns/B 155.8 MiB/s 15.29 c/B 2499
CFB enc | 6.49 ns/B 147.0 MiB/s 16.21 c/B 2500
CFB dec | 6.05 ns/B 157.6 MiB/s 15.13 c/B 2500
CTR enc | 6.09 ns/B 156.7 MiB/s 15.22 c/B 2500
CTR dec | 6.09 ns/B 156.6 MiB/s 15.22 c/B 2500
XTS enc | 6.16 ns/B 154.9 MiB/s 15.39 c/B 2500
XTS dec | 6.16 ns/B 154.8 MiB/s 15.40 c/B 2499
GCM enc | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500
GCM dec | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500
GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500
OCB enc | 6.63 ns/B 143.9 MiB/s 16.57 c/B 2499
OCB dec | 6.63 ns/B 143.9 MiB/s 16.56 c/B 2499
OCB auth | 6.55 ns/B 145.7 MiB/s 16.37 c/B 2499
After (ecb ~2.1x faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 2.77 ns/B 344.2 MiB/s 6.93 c/B 2499
ECB dec | 2.76 ns/B 345.3 MiB/s 6.90 c/B 2499
CBC enc | 6.17 ns/B 154.7 MiB/s 15.41 c/B 2499
CBC dec | 2.89 ns/B 330.3 MiB/s 7.22 c/B 2500
CFB enc | 6.48 ns/B 147.1 MiB/s 16.21 c/B 2499
CFB dec | 2.84 ns/B 336.1 MiB/s 7.09 c/B 2499
CTR enc | 2.90 ns/B 328.8 MiB/s 7.25 c/B 2499
CTR dec | 2.90 ns/B 328.9 MiB/s 7.25 c/B 2500
XTS enc | 2.93 ns/B 325.3 MiB/s 7.33 c/B 2500
XTS dec | 2.92 ns/B 326.2 MiB/s 7.31 c/B 2500
GCM enc | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2500
GCM dec | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2499
GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am | 14 | ||||
-rw-r--r-- | cipher/camellia-aarch64-ce.c | 42 | ||||
-rw-r--r-- | cipher/camellia-glue.c | 70 | ||||
-rw-r--r-- | configure.ac | 106 |
4 files changed, 227 insertions, 5 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 52435ed5..dcaa68bb 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -148,7 +148,7 @@ EXTRA_libcipher_la_SOURCES = \ camellia-aesni-avx2-amd64.h \ camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \ - camellia-arm.S camellia-aarch64.S \ + camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \ camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \ blake2.c \ blake2b-amd64-avx2.S blake2b-amd64-avx512.S \ @@ -238,6 +238,12 @@ else ppc_vcrypto_cflags = endif +if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS +aarch64_neon_cflags = -O2 -march=armv8-a+crypto +else +aarch64_neon_cflags = +endif + rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` @@ -297,3 +303,9 @@ camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile + `echo $(COMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) ` + +camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile + `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/camellia-aarch64-ce.c b/cipher/camellia-aarch64-ce.c new file mode 100644 index 00000000..76813e94 --- /dev/null +++ b/cipher/camellia-aarch64-ce.c @@ -0,0 +1,42 @@ +/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation + * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \ + (__GNUC__ >= 4) + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#define SIMD128_OPT_ATTR FUNC_ATTR_OPT + +#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16 +#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16 +#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen + +#include "camellia-simd128.h" + +#endif /* __AARCH64EL__ */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 46bbe182..0b07f2d1 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -119,6 +119,16 @@ # define USE_PPC_CRYPTO 1 #endif +/* USE_AARCH64_CE indicates whether to enable ARMv8/CE accelerated code. 
*/ +#undef USE_AARCH64_CE +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \ + (__GNUC__ >= 4) +# define USE_AARCH64_CE 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -138,6 +148,9 @@ typedef struct unsigned int use_ppc8:1; unsigned int use_ppc9:1; #endif /*USE_PPC_CRYPTO*/ +#ifdef USE_AARCH64_CE + unsigned int use_aarch64ce:1; +#endif /*USE_AARCH64_CE*/ } CAMELLIA_context; /* Assembly implementations use SystemV ABI, ABI conversion and additional @@ -472,6 +485,36 @@ static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *); #endif /*USE_PPC_CRYPTO*/ +#ifdef USE_AARCH64_CE +extern void _gcry_camellia_aarch64ce_encrypt_blk16(const void *key_table, + void *out, const void *in, + int key_length); + +extern void _gcry_camellia_aarch64ce_decrypt_blk16(const void *key_table, + void *out, const void *in, + int key_length); + +extern void _gcry_camellia_aarch64ce_keygen(void *key_table, const void *vkey, + unsigned int keylen); + +void camellia_aarch64ce_enc_blk16(const CAMELLIA_context *ctx, + unsigned char *out, const unsigned char *in) +{ + _gcry_camellia_aarch64ce_encrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +void camellia_aarch64ce_dec_blk16(const CAMELLIA_context *ctx, + unsigned char *out, const unsigned char *in) +{ + _gcry_camellia_aarch64ce_decrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +static const int aarch64ce_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + + 2 * sizeof(void *); +#endif /*USE_AARCH64_CE*/ + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -549,6 +592,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00); ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9; #endif +#ifdef 
USE_AARCH64_CE + ctx->use_aarch64ce = (hwf & HWF_ARM_AES) != 0; +#endif ctx->keybitlength=keylen*8; @@ -575,6 +621,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, else if (ctx->use_ppc8) _gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen); #endif +#ifdef USE_AARCH64_CE + else if (ctx->use_aarch64ce) + _gcry_camellia_aarch64ce_keygen(ctx->keytable, key, keylen); +#endif else { Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable); @@ -754,6 +804,16 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_AARCH64_CE + while (ctx->use_aarch64ce && num_blks >= 16) + { + camellia_aarch64ce_enc_blk16 (ctx, outbuf, inbuf); + stack_burn_size = aarch64ce_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { @@ -855,6 +915,16 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_AARCH64_CE + while (ctx->use_aarch64ce && num_blks >= 16) + { + camellia_aarch64ce_dec_blk16 (ctx, outbuf, inbuf); + stack_burn_size = aarch64ce_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { diff --git a/configure.ac b/configure.ac index a40a8135..0d5c9160 100644 --- a/configure.ac +++ b/configure.ac @@ -2136,7 +2136,103 @@ fi # -# Check whether PowerPC AltiVec/VSX intrinsics +# Check whether compiler supports AArch64/NEON/crypto intrinsics +# +AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics], + [gcry_cv_cc_aarch64_neon_intrinsics], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_cc_aarch64_neon_intrinsics="n/a" + else + gcry_cv_cc_aarch64_neon_intrinsics=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#include <arm_neon.h> + #define __m128i uint64x2_t + #define vpsrldq128(s, a, o) \ + ({ uint64x2_t __tmp = { 0, 0 }; \ + o = 
(__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15); }) + #define vaesenclast128(a, b, o) \ + (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + vpsrldq128(2, a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + vaesenclast128(in, x, in); + memory_barrier_with_vec(in); + return in; + } + ]])], + [gcry_cv_cc_aarch64_neon_intrinsics=yes]) + fi]) +if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics]) +fi + +_gcc_cflags_save=$CFLAGS +CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto" + +if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" && + test "$mpi_cpu_arch" = "aarch64" && + test "$try_asm_modules" = "yes" ; then + AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#include <arm_neon.h> + #define __m128i uint64x2_t + #define vpsrldq128(s, a, o) \ + ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15); }) + #define vaesenclast128(a, b, o) \ + (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + vpsrldq128(2, a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + vaesenclast128(in, x, in); + memory_barrier_with_vec(in); + return in; + } + ]])], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])]) + if test 
"$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics]) + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags]) + fi +fi + +AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS, + test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes") + +# Restore flags. +CFLAGS=$_gcc_cflags_save; + + +# +# Check whether compiler supports PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], @@ -2173,8 +2269,8 @@ _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && - test "$mpi_cpu_arch" = "ppc" && - test "$try_asm_modules" == "yes" ; then + test "$mpi_cpu_arch" = "ppc" && + test "$try_asm_modules" = "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no @@ -2193,7 +2289,8 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" && vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; - }]])], + } + ]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, @@ -2966,6 +3063,7 @@ if test "$found" = "1" ; then aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the POWER vector implementations |