diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-10-23 13:43:03 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-10-26 21:43:04 +0300 |
commit | 84f3d41acb2377d1ed0c2b9e8268de9d35e90af0 (patch) | |
tree | 4cc373cee35e924017c35bb4eda694a9e36be580 /cipher/rijndael-aesni.c | |
parent | bf5ec001dfcbd4a293d0bd577fd70a0f8286c4e6 (diff) | |
download | libgcrypt-84f3d41acb2377d1ed0c2b9e8268de9d35e90af0.tar.gz |
rijndael: add ECB acceleration (for benchmarking purposes)
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
* cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
* cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
'.Ldeclast' to '.Lenclast'.
(_gcry_aes_aesni_ecb_crypt): New.
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
return value from void to size_t.
(ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
indirect function call; Return value from called function (allows tail
call optimization).
(_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows
tail call optimization).
(_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
(_gcry_aes_armv8_ce_ecb_crypt): New.
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ecb_crypt_amd64): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
(_gcry_aes_vaes_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
(_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
(do_setkey): Set up ECB bulk function for x86 AESNI/VAES and ARM CE.
--
Benchmark on AMD Ryzen 9 7900X:
Before (OCB for reference):
AES | nanosecs/byte mebibytes/sec cycles/byte auto MHz
ECB enc | 0.128 ns/B 7460 MiB/s 0.720 c/B 5634±1
ECB dec | 0.134 ns/B 7103 MiB/s 0.753 c/B 5608
OCB enc | 0.029 ns/B 32930 MiB/s 0.163 c/B 5625
OCB dec | 0.029 ns/B 32738 MiB/s 0.164 c/B 5625
After:
AES | nanosecs/byte mebibytes/sec cycles/byte auto MHz
ECB enc | 0.028 ns/B 33761 MiB/s 0.159 c/B 5625
ECB dec | 0.028 ns/B 33917 MiB/s 0.158 c/B 5625
GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r-- | cipher/rijndael-aesni.c | 160 |
1 file changed, 157 insertions, 3 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 156af015..906737a6 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" - "jb .Ldeclast%=\n\t" + "jb .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" - "je .Ldeclast%=\n\t" + "je .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" - ".Ldeclast%=:\n\t" + ".Lenclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) @@ -1718,6 +1718,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, void ASM_FUNC_ATTR +_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src, size_t nblocks, + int encrypt) +{ + aesni_prepare_2_7_variable; + + aesni_prepare (); + aesni_prepare_2_7(); + + if (!encrypt && !ctx->decryption_prepared) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + +#ifdef __x86_64__ + if (nblocks >= 8) + { + const void *key = encrypt ? 
ctx->keyschenc : ctx->keyschdec; + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for (; nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqa (%[key]), %%xmm0\n\t" + "movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + "movdqu 4*16(%[src]), %%xmm8\n\t" + "movdqu 5*16(%[src]), %%xmm9\n\t" + "movdqu 6*16(%[src]), %%xmm10\n\t" + "movdqu 7*16(%[src]), %%xmm11\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + : /* No output */ + : [src] "r" (src), + [key] "r" (key) + : "memory"); + + if (encrypt) + { + do_aesni_enc_vec8 (ctx); + asm volatile + ("aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + else + { + do_aesni_dec_vec8 (ctx); + asm volatile + ("aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + "movdqu %%xmm8, 4*16(%[dst])\n\t" + "movdqu %%xmm9, 5*16(%[dst])\n\t" + "movdqu %%xmm10, 6*16(%[dst])\n\t" + "movdqu %%xmm11, 7*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 8*BLOCKSIZE; + src += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + + for (; nblocks >= 4; nblocks -= 4) + { + asm volatile + 
("movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + : /* No output */ + : [src] "r" (src) + : "memory"); + + if (encrypt) + do_aesni_enc_vec4 (ctx); + else + do_aesni_dec_vec4 (ctx); + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 4*BLOCKSIZE; + src += 4*BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + + if (encrypt) + do_aesni_enc (ctx); + else + do_aesni_dec (ctx); + + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + + dst += BLOCKSIZE; + src += BLOCKSIZE; + } + + aesni_cleanup (); + aesni_cleanup_2_7 (); +} + + +void ASM_FUNC_ATTR _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) |