diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-17 00:23:19 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 20:25:50 +0200 |
commit | f4268a8f51a89a7c0374a23f669d7a19cad304ae (patch) | |
tree | 8ad1385d20b087dbc577587ea1b9e5d8c2987462 /cipher/aria.c | |
parent | 833a904faf2b90a1b1d1b58e1e9a12f2e8e2378c (diff) | |
download | libgcrypt-f4268a8f51a89a7c0374a23f669d7a19cad304ae.tar.gz |
aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations
* cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and
'aria-aesni-avx2-amd64.S'.
* cipher/aria-aesni-avx-amd64.S: New.
* cipher/aria-aesni-avx2-amd64.S: New.
* cipher/aria.c (USE_AESNI_AVX, USE_GFNI_AVX, USE_AESNI_AVX2)
(USE_GFNI_AVX2, MAX_PARALLEL_BLKS, ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(ARIA_context): Add 'use_aesni_avx', 'use_gfni_avx',
'use_aesni_avx2' and 'use_gfni_avx2'.
(_gcry_aria_aesni_avx_ecb_crypt_blk1_16)
(_gcry_aria_aesni_avx_ctr_crypt_blk16)
(_gcry_aria_gfni_avx_ecb_crypt_blk1_16)
(_gcry_aria_gfni_avx_ctr_crypt_blk16)
(aria_avx_ecb_crypt_blk1_16, aria_avx_ctr_crypt_blk16)
(_gcry_aria_aesni_avx2_ecb_crypt_blk32)
(_gcry_aria_aesni_avx2_ctr_crypt_blk32)
(_gcry_aria_gfni_avx2_ecb_crypt_blk32)
(_gcry_aria_gfni_avx2_ctr_crypt_blk32)
(aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): New.
(aria_crypt_blocks) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(aria_crypt_blocks) [USE_AESNI_AVX]: Add 3 to 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX]: Add 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc, _gcry_aria_cbc_dec, _gcry_aria_cfb_enc)
(_gcry_aria_ecb_crypt, _gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc)
(_gcry_aria_ocb_crypt, _gcry_aria_ocb_auth): Use MAX_PARALLEL_BLKS
for parallel processing width.
(aria_setkey): Enable AESNI/AVX, GFNI/AVX, AESNI/AVX2, GFNI/AVX2 based
on HW features.
* configure.ac: Add 'aria-aesni-avx-amd64.lo' and
'aria-aesni-avx2-amd64.lo'.
---
This patch adds AVX/AVX2/AESNI/GFNI accelerated ARIA block cipher
implementations for libgcrypt. This implementation is based on work
by Taehee Yoo, with following notable changes:
- Integration to libgcrypt, use of 'aes-common-amd64.h'.
- Use 'vmovddup' for loading GFNI constants.
- Use round loop instead of unrolling for smaller code size and
increased performance.
- Use stack for temporary storage instead of external buffers.
- Merge ECB encryption/decryption into a single function.
- Add 1 to 15 blocks support for AVX ECB functions.
- Add byte-addition fast path for CTR.
===
Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.715 ns/B 1333 MiB/s 3.36 c/B 4700
ECB dec | 0.712 ns/B 1339 MiB/s 3.35 c/B 4700
CTR enc | 0.714 ns/B 1336 MiB/s 3.36 c/B 4700
CTR dec | 0.714 ns/B 1335 MiB/s 3.36 c/B 4700
GFNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.516 ns/B 1847 MiB/s 2.43 c/B 4700
ECB dec | 0.519 ns/B 1839 MiB/s 2.44 c/B 4700
CTR enc | 0.517 ns/B 1846 MiB/s 2.43 c/B 4700
CTR dec | 0.518 ns/B 1843 MiB/s 2.43 c/B 4700
AESNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.416 ns/B 2292 MiB/s 1.96 c/B 4700
ECB dec | 0.421 ns/B 2266 MiB/s 1.98 c/B 4700
CTR enc | 0.415 ns/B 2298 MiB/s 1.95 c/B 4700
CTR dec | 0.415 ns/B 2300 MiB/s 1.95 c/B 4700
GFNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.235 ns/B 4056 MiB/s 1.11 c/B 4700
ECB dec | 0.234 ns/B 4079 MiB/s 1.10 c/B 4700
CTR enc | 0.232 ns/B 4104 MiB/s 1.09 c/B 4700
CTR dec | 0.233 ns/B 4094 MiB/s 1.10 c/B 4700
===
Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.26 ns/B 757.6 MiB/s 3.77 c/B 2993
ECB dec | 1.27 ns/B 753.1 MiB/s 3.79 c/B 2992
CTR enc | 1.25 ns/B 760.3 MiB/s 3.75 c/B 2992
CTR dec | 1.26 ns/B 759.1 MiB/s 3.76 c/B 2992
GFNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.967 ns/B 986.6 MiB/s 2.89 c/B 2992
ECB dec | 0.966 ns/B 987.1 MiB/s 2.89 c/B 2992
CTR enc | 0.972 ns/B 980.8 MiB/s 2.91 c/B 2993
CTR dec | 0.971 ns/B 982.5 MiB/s 2.90 c/B 2993
AESNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.817 ns/B 1167 MiB/s 2.44 c/B 2992
ECB dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992
CTR enc | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992
CTR dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992
GFNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.506 ns/B 1886 MiB/s 1.51 c/B 2992
ECB dec | 0.505 ns/B 1887 MiB/s 1.51 c/B 2992
CTR enc | 0.564 ns/B 1691 MiB/s 1.69 c/B 2992
CTR dec | 0.565 ns/B 1689 MiB/s 1.69 c/B 2992
===
Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.921 ns/B 1035 MiB/s 3.50 c/B 3800
ECB dec | 0.922 ns/B 1034 MiB/s 3.50 c/B 3800
CTR enc | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800
CTR dec | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800
AESNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800
ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800
CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800
CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800
===
Benchmark on AMD EPYC 7642 (zen2):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.22 ns/B 784.5 MiB/s 4.01 c/B 3298
ECB dec | 1.22 ns/B 784.8 MiB/s 4.00 c/B 3292
CTR enc | 1.22 ns/B 780.1 MiB/s 4.03 c/B 3299
CTR dec | 1.22 ns/B 779.1 MiB/s 4.04 c/B 3299
AESNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.735 ns/B 1298 MiB/s 2.42 c/B 3299
ECB dec | 0.738 ns/B 1292 MiB/s 2.44 c/B 3299
CTR enc | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299
CTR dec | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299
===
Benchmark on Intel Core i5-6500 (skylake):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.24 ns/B 766.6 MiB/s 4.48 c/B 3598
ECB dec | 1.25 ns/B 764.9 MiB/s 4.49 c/B 3598
CTR enc | 1.25 ns/B 761.7 MiB/s 4.50 c/B 3598
CTR dec | 1.25 ns/B 761.6 MiB/s 4.51 c/B 3598
AESNI/AVX2:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3599
ECB dec | 0.831 ns/B 1147 MiB/s 2.99 c/B 3598
CTR enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3598
CTR dec | 0.828 ns/B 1152 MiB/s 2.98 c/B 3598
===
Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off):
AESNI/AVX:
ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 2.11 ns/B 452.7 MiB/s 5.25 c/B 2494
ECB dec | 2.10 ns/B 454.5 MiB/s 5.23 c/B 2494
CTR enc | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494
CTR dec | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494
[v2]
- Optimization for CTR mode: Use the CTR byte-addition path when a
counter carry-overflow happens only in the ctr-variable but not in the
generated counter vector registers.
Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/aria.c')
-rw-r--r-- | cipher/aria.c | 299 |
1 files changed, 274 insertions, 25 deletions
diff --git a/cipher/aria.c b/cipher/aria.c index 700ea409..18952d04 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -50,6 +50,60 @@ #endif +/* USE_AESNI_AVX inidicates whether to compile with Intel AES-NI/AVX code. */ +#undef USE_AESNI_AVX +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_AESNI_AVX 1 +# endif +#endif + +/* USE_GFNI_AVX inidicates whether to compile with Intel GFNI/AVX code. */ +#undef USE_GFNI_AVX +#if defined(USE_AESNI_AVX) && defined(ENABLE_GFNI_SUPPORT) +# define USE_GFNI_AVX 1 +#endif + +/* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */ +#undef USE_AESNI_AVX2 +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_AESNI_AVX2 1 +# endif +#endif + +/* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. */ +#undef USE_GFNI_AVX2 +#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) +# define USE_GFNI_AVX2 1 +#endif + +/* How many parallel blocks to handle in bulk processing functions. */ +#if defined(USE_AESNI_AVX2) +# define MAX_PARALLEL_BLKS 32 +#elif defined(USE_AESNI_AVX) +# define MAX_PARALLEL_BLKS 16 +#else +# define MAX_PARALLEL_BLKS 8 +#endif + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. 
*/ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + static const char *aria_selftest (void); @@ -69,6 +123,15 @@ typedef struct unsigned int decryption_prepared:1; /* The decryption key is set up. */ unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for * current bulk operation. */ + +#ifdef USE_AESNI_AVX + unsigned int use_aesni_avx:1; + unsigned int use_gfni_avx:1; +#endif +#ifdef USE_AESNI_AVX2 + unsigned int use_aesni_avx2:1; + unsigned int use_gfni_avx2:1; +#endif } ARIA_context; @@ -363,6 +426,102 @@ static struct 0 }; +#ifdef USE_AESNI_AVX +extern unsigned int +_gcry_aria_aesni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out, + const byte *in, const void *key, + u64 nblks) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_aesni_avx_ctr_crypt_blk16(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; + +#ifdef USE_GFNI_AVX +extern unsigned int +_gcry_aria_gfni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out, + const byte *in, const void *key, + u64 nblks) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_gfni_avx_ctr_crypt_blk16(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; +#endif /* USE_GFNI_AVX */ + +static inline unsigned int +aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in, + const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks) +{ +#ifdef USE_GFNI_AVX + if (ctx->use_gfni_avx) + return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX */ + return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + + ASM_EXTRA_STACK; +} + +static inline unsigned int +aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in, + byte *iv) 
+{ +#ifdef USE_GFNI_AVX + if (ctx->use_gfni_avx) + return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX */ + return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv) + + ASM_EXTRA_STACK; +} +#endif /* USE_AESNI_AVX */ + +#ifdef USE_AESNI_AVX2 +extern unsigned int +_gcry_aria_aesni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; + +#ifdef USE_GFNI_AVX2 +extern unsigned int +_gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_gfni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; +#endif /* USE_GFNI_AVX2 */ + +static inline unsigned int +aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, + const u32 key[][ARIA_RD_KEY_WORDS]) +{ +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX2 */ + return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key) + + ASM_EXTRA_STACK; +} + +static inline unsigned int +aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, + byte *iv) +{ +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX2 */ + return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + + ASM_EXTRA_STACK; +} +#endif /* USE_AESNI_AVX2 */ + /* Prefetching for sbox tables. 
*/ static inline void prefetch_table(const volatile byte *tab, size_t len) @@ -864,7 +1023,47 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS]) { unsigned int burn_depth = 0; - unsigned int nburn; + +#ifdef USE_AESNI_AVX2 + if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) + { + unsigned int nburn = 0; + + while (num_blks >= 32) + { + nburn = aria_avx2_ecb_crypt_blk32 (ctx, out, in, key); + in += 32 * ARIA_BLOCK_SIZE; + out += 32 * ARIA_BLOCK_SIZE; + num_blks -= 32; + } + + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + if (num_blks == 0) + return burn_depth; + } +#endif /* USE_AESNI_AVX2 */ + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx || ctx->use_gfni_avx) + { + unsigned int nburn = 0; + + while (num_blks >= 3) + { + size_t curr_blks = num_blks < 16 ? num_blks : 16; + nburn = aria_avx_ecb_crypt_blk1_16 (ctx, out, in, key, curr_blks); + in += curr_blks * ARIA_BLOCK_SIZE; + out += curr_blks * ARIA_BLOCK_SIZE; + num_blks -= curr_blks; + } + + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + if (num_blks == 0) + return burn_depth; + } +#endif /* USE_AESNI_AVX */ if (!ctx->bulk_prefetch_ready) { @@ -874,19 +1073,19 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, while (num_blks >= 2) { - nburn = aria_crypt_2blks (ctx, out, in, key); + unsigned int nburn = aria_crypt_2blks (ctx, out, in, key); burn_depth = nburn > burn_depth ? nburn : burn_depth; - out += 2 * 16; - in += 2 * 16; + out += 2 * ARIA_BLOCK_SIZE; + in += 2 * ARIA_BLOCK_SIZE; num_blks -= 2; } while (num_blks) { - nburn = aria_crypt (ctx, out, in, key); + unsigned int nburn = aria_crypt (ctx, out, in, key); burn_depth = nburn > burn_depth ? 
nburn : burn_depth; - out += 16; - in += 16; + out += ARIA_BLOCK_SIZE; + in += ARIA_BLOCK_SIZE; num_blks--; } @@ -925,12 +1124,46 @@ _gcry_aria_ctr_enc(void *context, unsigned char *ctr, const byte *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_AESNI_AVX2 + if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) + { + size_t nburn = 0; + + while (nblocks >= 32) + { + nburn = aria_avx2_ctr_crypt_blk32 (ctx, outbuf, inbuf, ctr); + inbuf += 32 * ARIA_BLOCK_SIZE; + outbuf += 32 * ARIA_BLOCK_SIZE; + nblocks -= 32; + } + + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } +#endif /* USE_AESNI_AVX */ + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx || ctx->use_gfni_avx) + { + size_t nburn = 0; + + while (nblocks >= 16) + { + nburn = aria_avx_ctr_crypt_blk16 (ctx, outbuf, inbuf, ctr); + inbuf += 16 * ARIA_BLOCK_SIZE; + outbuf += 16 * ARIA_BLOCK_SIZE; + nblocks -= 16; + } + + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } +#endif /* USE_AESNI_AVX */ + /* Process remaining blocks. */ if (nblocks) { - byte tmpbuf[16 * ARIA_BLOCK_SIZE]; + byte tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - size_t nburn; + size_t nburn = 0; ctx->bulk_prefetch_ready = 0; @@ -1002,7 +1235,7 @@ _gcry_aria_cbc_dec(void *context, unsigned char *iv, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1062,7 +1295,7 @@ _gcry_aria_cfb_dec(void *context, unsigned char *iv, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1099,14 +1332,14 @@ _gcry_aria_ecb_crypt (void *context, void *outbuf_arg, /* Process remaining blocks. 
*/ if (nblocks) { - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks; - nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_16, - outbuf, inbuf, nblocks, 16); + nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_n, + outbuf, inbuf, nblocks, MAX_PARALLEL_BLKS); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; } @@ -1133,15 +1366,15 @@ _gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks; - nburn = bulk_xts_crypt_128(ctx, crypt_blk1_16, + nburn = bulk_xts_crypt_128(ctx, crypt_blk1_n, outbuf, inbuf, nblocks, tweak, tmpbuf, sizeof(tmpbuf) / ARIA_BLOCK_SIZE, @@ -1169,7 +1402,7 @@ _gcry_aria_ctr32le_enc(void *context, unsigned char *ctr, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1208,15 +1441,15 @@ _gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? 
aria_enc_blocks : aria_dec_blocks; - nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_16, outbuf, inbuf, nblocks, + nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_n, outbuf, inbuf, nblocks, &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); @@ -1245,7 +1478,7 @@ _gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1275,6 +1508,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen, ARIA_context *ctx = c; static int initialized = 0; static const char *selftest_failed = NULL; + unsigned int hwf = _gcry_get_hw_features (); + + (void)hwf; if (keylen != 16 && keylen != 24 && keylen != 32) return GPG_ERR_INV_KEYLEN; @@ -1290,6 +1526,19 @@ aria_setkey(void *c, const byte *key, unsigned keylen, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#ifdef USE_AESNI_AVX2 + ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); +#endif +#ifdef USE_GFNI_AVX2 + ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); +#endif +#ifdef USE_AESNI_AVX + ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); +#endif +#ifdef USE_GFNI_AVX + ctx->use_gfni_avx = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX); +#endif + /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_enc = _gcry_aria_cbc_enc; |