author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2023-02-17 00:23:19 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2023-02-22 20:25:50 +0200
commit    f4268a8f51a89a7c0374a23f669d7a19cad304ae (patch)
tree      8ad1385d20b087dbc577587ea1b9e5d8c2987462 /cipher/aria.c
parent    833a904faf2b90a1b1d1b58e1e9a12f2e8e2378c (diff)
aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations
* cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and
'aria-aesni-avx2-amd64.S'.
* cipher/aria-aesni-avx-amd64.S: New.
* cipher/aria-aesni-avx2-amd64.S: New.
* cipher/aria.c (USE_AESNI_AVX, USE_GFNI_AVX, USE_AESNI_AVX2)
(USE_GFNI_AVX2, MAX_PARALLEL_BLKS, ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(ARIA_context): Add 'use_aesni_avx', 'use_gfni_avx', 'use_aesni_avx2'
and 'use_gfni_avx2'.
(_gcry_aria_aesni_avx_ecb_crypt_blk1_16)
(_gcry_aria_aesni_avx_ctr_crypt_blk16)
(_gcry_aria_gfni_avx_ecb_crypt_blk1_16)
(_gcry_aria_gfni_avx_ctr_crypt_blk16)
(aria_avx_ecb_crypt_blk1_16, aria_avx_ctr_crypt_blk16)
(_gcry_aria_aesni_avx2_ecb_crypt_blk32)
(_gcry_aria_aesni_avx2_ctr_crypt_blk32)
(_gcry_aria_gfni_avx2_ecb_crypt_blk32)
(_gcry_aria_gfni_avx2_ctr_crypt_blk32)
(aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): New.
(aria_crypt_blocks) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(aria_crypt_blocks) [USE_AESNI_AVX]: Add 3 to 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX]: Add 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc, _gcry_aria_cbc_dec, _gcry_aria_cfb_dec)
(_gcry_aria_ecb_crypt, _gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc)
(_gcry_aria_ocb_crypt, _gcry_aria_ocb_auth): Use MAX_PARALLEL_BLKS
for parallel processing width.
(aria_setkey): Enable AESNI/AVX, GFNI/AVX, AESNI/AVX2, GFNI/AVX2
based on HW features.
* configure.ac: Add 'aria-aesni-avx-amd64.lo' and
'aria-aesni-avx2-amd64.lo'.
---

This patch adds AVX/AVX2/AESNI/GFNI accelerated ARIA block cipher
implementations for libgcrypt. This implementation is based on work
by Taehee Yoo, with the following notable changes:
 - Integration to libgcrypt, use of 'aes-common-amd64.h'.
 - Use 'vmovddup' for loading GFNI constants.
 - Use a round loop instead of unrolling, for smaller code size and
   increased performance.
 - Use stack for temporary storage instead of external buffers.
 - Merge ECB encryption/decryption into a single function.
 - Add 1 to 15 blocks support for AVX ECB functions.
 - Add byte-addition fast path for CTR.
=== Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.715 ns/B      1333 MiB/s      3.36 c/B      4700
       ECB dec |     0.712 ns/B      1339 MiB/s      3.35 c/B      4700
       CTR enc |     0.714 ns/B      1336 MiB/s      3.36 c/B      4700
       CTR dec |     0.714 ns/B      1335 MiB/s      3.36 c/B      4700

GFNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.516 ns/B      1847 MiB/s      2.43 c/B      4700
       ECB dec |     0.519 ns/B      1839 MiB/s      2.44 c/B      4700
       CTR enc |     0.517 ns/B      1846 MiB/s      2.43 c/B      4700
       CTR dec |     0.518 ns/B      1843 MiB/s      2.43 c/B      4700

AESNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.416 ns/B      2292 MiB/s      1.96 c/B      4700
       ECB dec |     0.421 ns/B      2266 MiB/s      1.98 c/B      4700
       CTR enc |     0.415 ns/B      2298 MiB/s      1.95 c/B      4700
       CTR dec |     0.415 ns/B      2300 MiB/s      1.95 c/B      4700

GFNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.235 ns/B      4056 MiB/s      1.11 c/B      4700
       ECB dec |     0.234 ns/B      4079 MiB/s      1.10 c/B      4700
       CTR enc |     0.232 ns/B      4104 MiB/s      1.09 c/B      4700
       CTR dec |     0.233 ns/B      4094 MiB/s      1.10 c/B      4700

=== Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |      1.26 ns/B     757.6 MiB/s      3.77 c/B      2993
       ECB dec |      1.27 ns/B     753.1 MiB/s      3.79 c/B      2992
       CTR enc |      1.25 ns/B     760.3 MiB/s      3.75 c/B      2992
       CTR dec |      1.26 ns/B     759.1 MiB/s      3.76 c/B      2992

GFNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.967 ns/B     986.6 MiB/s      2.89 c/B      2992
       ECB dec |     0.966 ns/B     987.1 MiB/s      2.89 c/B      2992
       CTR enc |     0.972 ns/B     980.8 MiB/s      2.91 c/B      2993
       CTR dec |     0.971 ns/B     982.5 MiB/s      2.90 c/B      2993

AESNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.817 ns/B      1167 MiB/s      2.44 c/B      2992
       ECB dec |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992
       CTR enc |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992
       CTR dec |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992

GFNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.506 ns/B      1886 MiB/s      1.51 c/B      2992
       ECB dec |     0.505 ns/B      1887 MiB/s      1.51 c/B      2992
       CTR enc |     0.564 ns/B      1691 MiB/s      1.69 c/B      2992
       CTR dec |     0.565 ns/B      1689 MiB/s      1.69 c/B      2992

=== Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.921 ns/B      1035 MiB/s      3.50 c/B      3800
       ECB dec |     0.922 ns/B      1034 MiB/s      3.50 c/B      3800
       CTR enc |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
       CTR dec |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800

AESNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.559 ns/B      1707 MiB/s      2.12 c/B      3800
       ECB dec |     0.560 ns/B      1703 MiB/s      2.13 c/B      3800
       CTR enc |     0.570 ns/B      1672 MiB/s      2.17 c/B      3800
       CTR dec |     0.568 ns/B      1679 MiB/s      2.16 c/B      3800

=== Benchmark on AMD EPYC 7642 (zen2):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |      1.22 ns/B     784.5 MiB/s      4.01 c/B      3298
       ECB dec |      1.22 ns/B     784.8 MiB/s      4.00 c/B      3292
       CTR enc |      1.22 ns/B     780.1 MiB/s      4.03 c/B      3299
       CTR dec |      1.22 ns/B     779.1 MiB/s      4.04 c/B      3299

AESNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.735 ns/B      1298 MiB/s      2.42 c/B      3299
       ECB dec |     0.738 ns/B      1292 MiB/s      2.44 c/B      3299
       CTR enc |     0.732 ns/B      1303 MiB/s      2.41 c/B      3299
       CTR dec |     0.732 ns/B      1303 MiB/s      2.41 c/B      3299

=== Benchmark on Intel Core i5-6500 (skylake):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |      1.24 ns/B     766.6 MiB/s      4.48 c/B      3598
       ECB dec |      1.25 ns/B     764.9 MiB/s      4.49 c/B      3598
       CTR enc |      1.25 ns/B     761.7 MiB/s      4.50 c/B      3598
       CTR dec |      1.25 ns/B     761.6 MiB/s      4.51 c/B      3598

AESNI/AVX2:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |     0.829 ns/B      1150 MiB/s      2.98 c/B      3599
       ECB dec |     0.831 ns/B      1147 MiB/s      2.99 c/B      3598
       CTR enc |     0.829 ns/B      1150 MiB/s      2.98 c/B      3598
       CTR dec |     0.828 ns/B      1152 MiB/s      2.98 c/B      3598

=== Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off):

AESNI/AVX:
 ARIA128       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       ECB enc |      2.11 ns/B     452.7 MiB/s      5.25 c/B      2494
       ECB dec |      2.10 ns/B     454.5 MiB/s      5.23 c/B      2494
       CTR enc |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
       CTR dec |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494

[v2]:
 - Optimization for CTR mode: use the CTR byte-addition path when the
   counter carry-overflow happens only in the ctr variable but not in
   the generated counter vector registers.

Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
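For context on the [v2] note: byte-addition builds the i-th counter block by adding i to the last counter byte only, so it is valid exactly when that byte does not wrap within the generated batch; a wrap that only shows up when the ctr variable itself is advanced afterwards is harmless. A rough C sketch of that condition follows (the helper is hypothetical; the real check lives in the assembly):

  /* Hypothetical sketch: nonzero when the byte-addition CTR path may
   * be used for 'nblks' blocks, i.e. adding the block index to the
   * last big-endian counter byte never carries into byte 14. */
  static int
  ctr_byte_add_ok (const byte *ctr, unsigned int nblks)
  {
    return (unsigned int)ctr[15] + nblks - 1 <= 0xff;
  }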
Diffstat (limited to 'cipher/aria.c')
-rw-r--r--  cipher/aria.c | 299
1 file changed, 274 insertions(+), 25 deletions(-)
diff --git a/cipher/aria.c b/cipher/aria.c
index 700ea409..18952d04 100644
--- a/cipher/aria.c
+++ b/cipher/aria.c
@@ -50,6 +50,60 @@
#endif
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* USE_GFNI_AVX indicates whether to compile with Intel GFNI/AVX code. */
+#undef USE_GFNI_AVX
+#if defined(USE_AESNI_AVX) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX 1
+#endif
+
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX2 1
+# endif
+#endif
+
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
+/* How many parallel blocks to handle in bulk processing functions. */
+#if defined(USE_AESNI_AVX2)
+# define MAX_PARALLEL_BLKS 32
+#elif defined(USE_AESNI_AVX)
+# define MAX_PARALLEL_BLKS 16
+#else
+# define MAX_PARALLEL_BLKS 8
+#endif
+
+/* Assembly implementations use the SystemV ABI; on Win64, ABI conversion
+ * and additional stack to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
static const char *aria_selftest (void);
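To make the ASM_FUNC_ABI/ASM_EXTRA_STACK pair above concrete, a hedged sketch follows ('example_asm_fn' is hypothetical). On Win64, the compiler emits an ABI conversion around each sysv_abi call that spills the callee-saved registers XMM6-XMM15, ten 16-byte registers, so those 160 bytes are added to the reported stack-burn depth:

  /* Declare a SysV-ABI assembly routine and report its burn depth,
   * including the Win64 ABI-conversion spill area. */
  extern unsigned int example_asm_fn (const void *ctx) ASM_FUNC_ABI;

  static unsigned int
  call_example (const void *ctx)
  {
    return example_asm_fn (ctx) + ASM_EXTRA_STACK;
  }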
@@ -69,6 +123,15 @@ typedef struct
unsigned int decryption_prepared:1; /* The decryption key is set up. */
unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for
* current bulk operation. */
+
+#ifdef USE_AESNI_AVX
+ unsigned int use_aesni_avx:1;
+ unsigned int use_gfni_avx:1;
+#endif
+#ifdef USE_AESNI_AVX2
+ unsigned int use_aesni_avx2:1;
+ unsigned int use_gfni_avx2:1;
+#endif
} ARIA_context;
@@ -363,6 +426,102 @@ static struct
0
};
+#ifdef USE_AESNI_AVX
+extern unsigned int
+_gcry_aria_aesni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+ const byte *in, const void *key,
+ u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_GFNI_AVX
+extern unsigned int
+_gcry_aria_gfni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+ const byte *in, const void *key,
+ u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX */
+
+static inline unsigned int
+aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in,
+ const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks)
+{
+#ifdef USE_GFNI_AVX
+ if (ctx->use_gfni_avx)
+ return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+ + ASM_EXTRA_STACK;
+ else
+#endif /* USE_GFNI_AVX */
+ return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+ + ASM_EXTRA_STACK;
+}
+
+static inline unsigned int
+aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in,
+ byte *iv)
+{
+#ifdef USE_GFNI_AVX
+ if (ctx->use_gfni_avx)
+ return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+ + ASM_EXTRA_STACK;
+ else
+#endif /* USE_GFNI_AVX */
+ return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+ + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX */
+
+#ifdef USE_AESNI_AVX2
+extern unsigned int
+_gcry_aria_aesni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+ const byte *in,
+ const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_GFNI_AVX2
+extern unsigned int
+_gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+ const byte *in,
+ const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX2 */
+
+static inline unsigned int
+aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+ const u32 key[][ARIA_RD_KEY_WORDS])
+{
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+ + ASM_EXTRA_STACK;
+ else
+#endif /* USE_GFNI_AVX2 */
+ return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+ + ASM_EXTRA_STACK;
+}
+
+static inline unsigned int
+aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+ byte *iv)
+{
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+ + ASM_EXTRA_STACK;
+ else
+#endif /* USE_GFNI_AVX2 */
+ return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+ + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX2 */
+
/* Prefetching for sbox tables. */
static inline void
prefetch_table(const volatile byte *tab, size_t len)
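A usage sketch for the dispatch wrappers above (the caller is hypothetical and the 'enc_key' field name is assumed): the returned burn depth already includes ASM_EXTRA_STACK and is handed to libgcrypt's stack wiper.

  /* Hypothetical caller: encrypt 16 blocks through the AVX path, then
   * wipe whatever stack the assembly may have dirtied. */
  unsigned int nburn = aria_avx_ecb_crypt_blk1_16 (ctx, outbuf, inbuf,
                                                   ctx->enc_key, 16);
  if (nburn)
    _gcry_burn_stack (nburn + 4 * sizeof(void *));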
@@ -864,7 +1023,47 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in,
size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS])
{
unsigned int burn_depth = 0;
- unsigned int nburn;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+ {
+ unsigned int nburn = 0;
+
+ while (num_blks >= 32)
+ {
+ nburn = aria_avx2_ecb_crypt_blk32 (ctx, out, in, key);
+ in += 32 * ARIA_BLOCK_SIZE;
+ out += 32 * ARIA_BLOCK_SIZE;
+ num_blks -= 32;
+ }
+
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ if (num_blks == 0)
+ return burn_depth;
+ }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+ {
+ unsigned int nburn = 0;
+
+ while (num_blks >= 3)
+ {
+ size_t curr_blks = num_blks < 16 ? num_blks : 16;
+ nburn = aria_avx_ecb_crypt_blk1_16 (ctx, out, in, key, curr_blks);
+ in += curr_blks * ARIA_BLOCK_SIZE;
+ out += curr_blks * ARIA_BLOCK_SIZE;
+ num_blks -= curr_blks;
+ }
+
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ if (num_blks == 0)
+ return burn_depth;
+ }
+#endif /* USE_AESNI_AVX */
if (!ctx->bulk_prefetch_ready)
{
@@ -874,19 +1073,19 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in,
while (num_blks >= 2)
{
- nburn = aria_crypt_2blks (ctx, out, in, key);
+ unsigned int nburn = aria_crypt_2blks (ctx, out, in, key);
burn_depth = nburn > burn_depth ? nburn : burn_depth;
- out += 2 * 16;
- in += 2 * 16;
+ out += 2 * ARIA_BLOCK_SIZE;
+ in += 2 * ARIA_BLOCK_SIZE;
num_blks -= 2;
}
while (num_blks)
{
- nburn = aria_crypt (ctx, out, in, key);
+ unsigned int nburn = aria_crypt (ctx, out, in, key);
burn_depth = nburn > burn_depth ? nburn : burn_depth;
- out += 16;
- in += 16;
+ out += ARIA_BLOCK_SIZE;
+ in += ARIA_BLOCK_SIZE;
num_blks--;
}
@@ -925,12 +1124,46 @@ _gcry_aria_ctr_enc(void *context, unsigned char *ctr,
const byte *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+ {
+ size_t nburn = 0;
+
+ while (nblocks >= 32)
+ {
+ nburn = aria_avx2_ctr_crypt_blk32 (ctx, outbuf, inbuf, ctr);
+ inbuf += 32 * ARIA_BLOCK_SIZE;
+ outbuf += 32 * ARIA_BLOCK_SIZE;
+ nblocks -= 32;
+ }
+
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+ }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+ {
+ size_t nburn = 0;
+
+ while (nblocks >= 16)
+ {
+ nburn = aria_avx_ctr_crypt_blk16 (ctx, outbuf, inbuf, ctr);
+ inbuf += 16 * ARIA_BLOCK_SIZE;
+ outbuf += 16 * ARIA_BLOCK_SIZE;
+ nblocks -= 16;
+ }
+
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+ }
+#endif /* USE_AESNI_AVX */
+
/* Process remaining blocks. */
if (nblocks)
{
- byte tmpbuf[16 * ARIA_BLOCK_SIZE];
+ byte tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
- size_t nburn;
+ size_t nburn = 0;
ctx->bulk_prefetch_ready = 0;
@@ -1002,7 +1235,7 @@ _gcry_aria_cbc_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
size_t nburn;
@@ -1062,7 +1295,7 @@ _gcry_aria_cfb_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
size_t nburn;
@@ -1099,14 +1332,14 @@ _gcry_aria_ecb_crypt (void *context, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- bulk_crypt_fn_t crypt_blk1_16;
+ bulk_crypt_fn_t crypt_blk1_n;
size_t nburn;
ctx->bulk_prefetch_ready = 0;
- crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+ crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
- nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_16,
- outbuf, inbuf, nblocks, 16);
+ nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_n,
+ outbuf, inbuf, nblocks, MAX_PARALLEL_BLKS);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
}
@@ -1133,15 +1366,15 @@ _gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
- bulk_crypt_fn_t crypt_blk1_16;
+ bulk_crypt_fn_t crypt_blk1_n;
size_t nburn;
ctx->bulk_prefetch_ready = 0;
- crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+ crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
- nburn = bulk_xts_crypt_128(ctx, crypt_blk1_16,
+ nburn = bulk_xts_crypt_128(ctx, crypt_blk1_n,
outbuf, inbuf, nblocks,
tweak, tmpbuf,
sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
@@ -1169,7 +1402,7 @@ _gcry_aria_ctr32le_enc(void *context, unsigned char *ctr,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
size_t nburn;
@@ -1208,15 +1441,15 @@ _gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
- bulk_crypt_fn_t crypt_blk1_16;
+ bulk_crypt_fn_t crypt_blk1_n;
size_t nburn;
ctx->bulk_prefetch_ready = 0;
- crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+ crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
- nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_16, outbuf, inbuf, nblocks,
+ nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_n, outbuf, inbuf, nblocks,
&blkn, encrypt, tmpbuf,
sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
&tmp_used);
@@ -1245,7 +1478,7 @@ _gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
/* Process remaining blocks. */
if (nblocks)
{
- unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
unsigned int tmp_used = ARIA_BLOCK_SIZE;
size_t nburn;
@@ -1275,6 +1508,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen,
ARIA_context *ctx = c;
static int initialized = 0;
static const char *selftest_failed = NULL;
+ unsigned int hwf = _gcry_get_hw_features ();
+
+ (void)hwf;
if (keylen != 16 && keylen != 24 && keylen != 32)
return GPG_ERR_INV_KEYLEN;
@@ -1290,6 +1526,19 @@ aria_setkey(void *c, const byte *key, unsigned keylen,
if (selftest_failed)
return GPG_ERR_SELFTEST_FAILED;
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_GFNI_AVX
+ ctx->use_gfni_avx = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX);
+#endif
+
/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
bulk_ops->cbc_enc = _gcry_aria_cbc_enc;
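A closing note on the feature gating in aria_setkey: on GFNI-capable CPUs both the AESNI and GFNI flags are set and the dispatch wrappers prefer the GFNI variants. For comparing the two paths, a feature can be masked off before initialization; a sketch using libgcrypt's hardware-feature disable control (the "intel-gfni" name is assumed to match the HWF table):

  /* Force the AESNI fallback by hiding GFNI from libgcrypt; this must
   * run before the library is initialized. */
  gcry_control (GCRYCTL_DISABLE_HWF, "intel-gfni", NULL);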