summaryrefslogtreecommitdiff
path: root/cipher/rijndael-aesni.c
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-23 13:43:03 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-26 21:43:04 +0300
commit 84f3d41acb2377d1ed0c2b9e8268de9d35e90af0 (patch)
tree 4cc373cee35e924017c35bb4eda694a9e36be580 /cipher/rijndael-aesni.c
parent bf5ec001dfcbd4a293d0bd577fd70a0f8286c4e6 (diff)
download libgcrypt-84f3d41acb2377d1ed0c2b9e8268de9d35e90af0.tar.gz
rijndael: add ECB acceleration (for benchmarking purposes)
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
* cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
* cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
'.Ldeclast' to '.Lenclast'.
(_gcry_aes_aesni_ecb_crypt): New.
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
return value from void to size_t.
(ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
indirect function call; Return value from called function (allows tail
call optimization).
(_gcry_aes_armv8_ce_ocb_auth): Return value from called function
(allows tail call optimization).
(_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
(_gcry_aes_armv8_ce_ecb_crypt): New.
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ecb_crypt_amd64): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
(_gcry_aes_vaes_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
(_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
(do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE.
--

Benchmark on AMD Ryzen 9 7900X:

Before (OCB for reference):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.128 ns/B      7460 MiB/s     0.720 c/B      5634±1
        ECB dec |     0.134 ns/B      7103 MiB/s     0.753 c/B      5608
        OCB enc |     0.029 ns/B     32930 MiB/s     0.163 c/B      5625
        OCB dec |     0.029 ns/B     32738 MiB/s     0.164 c/B      5625

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.028 ns/B     33761 MiB/s     0.159 c/B      5625
        ECB dec |     0.028 ns/B     33917 MiB/s     0.158 c/B      5625

GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r-- cipher/rijndael-aesni.c 160
1 files changed, 157 insertions, 3 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 156af015..906737a6 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm10\n\t"
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xa0(%[key]), %%xmm0\n\t"
- "jb .Ldeclast%=\n\t"
+ "jb .Lenclast%=\n\t"
"aesenc %%xmm0, %%xmm1\n\t"
"aesenc %%xmm0, %%xmm2\n\t"
"aesenc %%xmm0, %%xmm3\n\t"
@@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm10\n\t"
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xc0(%[key]), %%xmm0\n\t"
- "je .Ldeclast%=\n\t"
+ "je .Lenclast%=\n\t"
"aesenc %%xmm0, %%xmm1\n\t"
"aesenc %%xmm0, %%xmm2\n\t"
"aesenc %%xmm0, %%xmm3\n\t"
@@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xe0(%[key]), %%xmm0\n"
- ".Ldeclast%=:\n\t"
+ ".Lenclast%=:\n\t"
: /* no output */
: [key] "r" (ctx->keyschenc),
[rounds] "r" (ctx->rounds)
@@ -1718,6 +1718,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
void ASM_FUNC_ATTR
+_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst, /* ECB bulk routine: processes NBLOCKS blocks from SRC into DST. */
+ const unsigned char *src, size_t nblocks, /* SRC and DST advance by BLOCKSIZE per block; blocks are consumed 8, then 4, then 1 at a time. */
+ int encrypt) /* Non-zero selects encryption, zero selects decryption. */
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ if (!encrypt && !ctx->decryption_prepared) /* Decryption setup runs once per context, on the first decrypt call. */
+ {
+ do_aesni_prepare_decryption ( ctx ); /* NOTE(review): presumably fills ctx->keyschdec -- confirm. */
+ ctx->decryption_prepared = 1;
+ }
+
+#ifdef __x86_64__ /* The 8-block path uses xmm8-xmm11, which are only available in 64-bit mode. */
+ if (nblocks >= 8)
+ {
+ const void *key = encrypt ? ctx->keyschenc : ctx->keyschdec; /* Key schedule matching the requested direction. */
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ for (; nblocks >= 8; nblocks -= 8) /* Eight blocks per iteration. */
+ {
+ asm volatile /* NOTE(review): xmm registers carry state between the asm statements below without operands or clobbers; presumably covered by the aesni_prepare conventions -- confirm. */
+ ("movdqa (%[key]), %%xmm0\n\t" /* xmm0 = first 16 bytes of the selected key schedule. */
+ "movdqu 0*16(%[src]), %%xmm1\n\t" /* Unaligned loads of the eight input blocks into xmm1-4 and xmm8-11. */
+ "movdqu 1*16(%[src]), %%xmm2\n\t"
+ "movdqu 2*16(%[src]), %%xmm3\n\t"
+ "movdqu 3*16(%[src]), %%xmm4\n\t"
+ "movdqu 4*16(%[src]), %%xmm8\n\t"
+ "movdqu 5*16(%[src]), %%xmm9\n\t"
+ "movdqu 6*16(%[src]), %%xmm10\n\t"
+ "movdqu 7*16(%[src]), %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* XOR that first key into every block before the rounds run. */
+ "pxor %%xmm0, %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %%xmm0, %%xmm9\n\t"
+ "pxor %%xmm0, %%xmm10\n\t"
+ "pxor %%xmm0, %%xmm11\n\t"
+ : /* No output */
+ : [src] "r" (src),
+ [key] "r" (key)
+ : "memory");
+
+ if (encrypt)
+ {
+ do_aesni_enc_vec8 (ctx); /* Runs the aesenc rounds; its asm ends at .Lenclast with a loaded but not yet applied round key in xmm0. */
+ asm volatile
+ ("aesenclast %%xmm0, %%xmm1\n\t" /* Final encryption round, using the key left in xmm0. */
+ "aesenclast %%xmm0, %%xmm2\n\t"
+ "aesenclast %%xmm0, %%xmm3\n\t"
+ "aesenclast %%xmm0, %%xmm4\n\t"
+ "aesenclast %%xmm0, %%xmm8\n\t"
+ "aesenclast %%xmm0, %%xmm9\n\t"
+ "aesenclast %%xmm0, %%xmm10\n\t"
+ "aesenclast %%xmm0, %%xmm11\n\t"
+ ::: "memory" );
+ }
+ else
+ {
+ do_aesni_dec_vec8 (ctx); /* NOTE(review): presumably mirrors the enc helper and leaves the last round key in xmm0 -- confirm. */
+ asm volatile
+ ("aesdeclast %%xmm0, %%xmm1\n\t" /* Final decryption round, using xmm0 as the round key. */
+ "aesdeclast %%xmm0, %%xmm2\n\t"
+ "aesdeclast %%xmm0, %%xmm3\n\t"
+ "aesdeclast %%xmm0, %%xmm4\n\t"
+ "aesdeclast %%xmm0, %%xmm8\n\t"
+ "aesdeclast %%xmm0, %%xmm9\n\t"
+ "aesdeclast %%xmm0, %%xmm10\n\t"
+ "aesdeclast %%xmm0, %%xmm11\n\t"
+ ::: "memory" );
+ }
+
+ asm volatile
+ ("movdqu %%xmm1, 0*16(%[dst])\n\t" /* Unaligned stores of the eight result blocks. */
+ "movdqu %%xmm2, 1*16(%[dst])\n\t"
+ "movdqu %%xmm3, 2*16(%[dst])\n\t"
+ "movdqu %%xmm4, 3*16(%[dst])\n\t"
+ "movdqu %%xmm8, 4*16(%[dst])\n\t"
+ "movdqu %%xmm9, 5*16(%[dst])\n\t"
+ "movdqu %%xmm10, 6*16(%[dst])\n\t"
+ "movdqu %%xmm11, 7*16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (dst)
+ : "memory");
+
+ dst += 8*BLOCKSIZE;
+ src += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for (; nblocks >= 4; nblocks -= 4) /* Four blocks per iteration. */
+ {
+ asm volatile
+ ("movdqu 0*16(%[src]), %%xmm1\n\t" /* Unaligned loads of four input blocks into xmm1-xmm4. */
+ "movdqu 1*16(%[src]), %%xmm2\n\t"
+ "movdqu 2*16(%[src]), %%xmm3\n\t"
+ "movdqu 3*16(%[src]), %%xmm4\n\t"
+ : /* No output */
+ : [src] "r" (src)
+ : "memory");
+
+ if (encrypt)
+ do_aesni_enc_vec4 (ctx); /* NOTE(review): no key XOR or last-round asm here, so the vec4 helpers presumably do the full transform of xmm1-xmm4 in place -- confirm. */
+ else
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("movdqu %%xmm1, 0*16(%[dst])\n\t" /* Unaligned stores of the four result blocks. */
+ "movdqu %%xmm2, 1*16(%[dst])\n\t"
+ "movdqu %%xmm3, 2*16(%[dst])\n\t"
+ "movdqu %%xmm4, 3*16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (dst)
+ : "memory");
+
+ dst += 4*BLOCKSIZE;
+ src += 4*BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--) /* Remaining blocks, one at a time. */
+ {
+ asm volatile ("movdqu %[src], %%xmm0\n\t" /* Load one input block into xmm0. */
+ :
+ : [src] "m" (*src)
+ : "memory" );
+
+ if (encrypt)
+ do_aesni_enc (ctx); /* Single-block helpers: the block is handed over in xmm0 and the result is read back from xmm0 below. */
+ else
+ do_aesni_dec (ctx);
+
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t" /* Store the result block from xmm0. */
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+
+ dst += BLOCKSIZE;
+ src += BLOCKSIZE;
+ }
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
+void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
unsigned char *outbuf, const unsigned char *inbuf,
size_t nblocks)