diff options
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2022-10-23 13:43:03 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2022-10-26 21:43:04 +0300
commit    84f3d41acb2377d1ed0c2b9e8268de9d35e90af0 (patch)
tree      4cc373cee35e924017c35bb4eda694a9e36be580 /cipher/rijndael-armv8-aarch32-ce.S
parent    bf5ec001dfcbd4a293d0bd577fd70a0f8286c4e6 (diff)
download  libgcrypt-84f3d41acb2377d1ed0c2b9e8268de9d35e90af0.tar.gz
rijndael: add ECB acceleration (for benchmarking purposes)
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
* cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
* cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
'.Ldeclast' to '.Lenclast'.
(_gcry_aes_aesni_ecb_crypt): New.
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
return value from void to size_t.
(ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
indirect function call; Return value from called function (allows tail
call optimization).
(_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows
tail call optimization).
(_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
(_gcry_aes_armv8_ce_ecb_crypt): New.
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ecb_crypt_amd64): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
(_gcry_aes_vaes_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
(_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
(do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE.
--
Benchmark on AMD Ryzen 9 7900X:
Before (OCB for reference):
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.128 ns/B 7460 MiB/s 0.720 c/B 5634±1
ECB dec | 0.134 ns/B 7103 MiB/s 0.753 c/B 5608
OCB enc | 0.029 ns/B 32930 MiB/s 0.163 c/B 5625
OCB dec | 0.029 ns/B 32738 MiB/s 0.164 c/B 5625
After:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.028 ns/B 33761 MiB/s 0.159 c/B 5625
ECB dec | 0.028 ns/B 33917 MiB/s 0.158 c/B 5625
GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-armv8-aarch32-ce.S')
-rw-r--r-- | cipher/rijndael-armv8-aarch32-ce.S | 152 |
1 file changed, 149 insertions, 3 deletions
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 1eafa93e..6208652b 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -654,6 +654,149 @@ _gcry_aes_cbc_dec_armv8_ce: /* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_enc_armv8_ce +.type _gcry_aes_ecb_enc_armv8_ce,%function; +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192e + bhi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc, ...) \ + .Lecb_entry_##bits##e_d: \ + cmp r3, #4; \ + blo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + sub r3, r3, #4; \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + cmp r3, #4; \ + \ + do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ + vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ + \ + bhs .Lecb_loop4_##bits##e_d; \ + cmp r3, #0; \ + beq .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r3, r3, #1; \ + \ + do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \ + \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + bne .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc, r0, lr) + ECB_CRYPT(256, e, mc, r0, lr) + +.Lecb_done_e: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_enc_skip: + pop {r4-r6,pc} 
+.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce; + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +.type _gcry_aes_ecb_dec_armv8_ce,%function; +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192d + bhi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc, r0, lr) + ECB_CRYPT(256, d, imc, r0, lr) + +#undef ECB_CRYPT + +.Lecb_done_d: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce; + + +/* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, @@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce: /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1486,7 +1631,7 
@@ _gcry_aes_ocb_dec_armv8_ce: /* - * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr |