summaryrefslogtreecommitdiff
path: root/cipher/rijndael-armv8-aarch32-ce.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2022-10-23 13:43:03 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2022-10-26 21:43:04 +0300
commit84f3d41acb2377d1ed0c2b9e8268de9d35e90af0 (patch)
tree4cc373cee35e924017c35bb4eda694a9e36be580 /cipher/rijndael-armv8-aarch32-ce.S
parentbf5ec001dfcbd4a293d0bd577fd70a0f8286c4e6 (diff)
downloadlibgcrypt-84f3d41acb2377d1ed0c2b9e8268de9d35e90af0.tar.gz
rijndael: add ECB acceleration (for benchmarking purposes)
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'. * cipher/cipher.c (do_ecb_crypt): Use bulk function if available. * cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label '.Ldeclast' to '.Lenclast'. (_gcry_aes_aesni_ecb_crypt): New. * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce) (_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change return value from void to size_t. (ocb_crypt_fn_t, xts_crypt_fn_t): Remove. (_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove indirect function call; Return value from called function (allows tail call optimization). (_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows tail call optimization). (_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce) (_gcry_aes_armv8_ce_ecb_crypt): New. * cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ecb_crypt_amd64): New. * cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64) (_gcry_aes_vaes_ecb_crypt): New. * cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt) (_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New. (do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE. -- Benchmark on AMD Ryzen 9 7900X: Before (OCB for reference): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.128 ns/B 7460 MiB/s 0.720 c/B 5634±1 ECB dec | 0.134 ns/B 7103 MiB/s 0.753 c/B 5608 OCB enc | 0.029 ns/B 32930 MiB/s 0.163 c/B 5625 OCB dec | 0.029 ns/B 32738 MiB/s 0.164 c/B 5625 After: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.028 ns/B 33761 MiB/s 0.159 c/B 5625 ECB dec | 0.028 ns/B 33917 MiB/s 0.158 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-armv8-aarch32-ce.S')
-rw-r--r--cipher/rijndael-armv8-aarch32-ce.S152
1 files changed, 149 insertions, 3 deletions
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 1eafa93e..6208652b 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -654,6 +654,149 @@ _gcry_aes_cbc_dec_armv8_ce:
/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+.type _gcry_aes_ecb_enc_armv8_ce,%function;
+_gcry_aes_ecb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: nblocks
+ * %st+0: nrounds => r4
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ cmp r3, #0
+ beq .Lecb_enc_skip
+ ldr r4, [sp, #(16+0)]
+ vpush {q4-q7}
+
+ cmp r4, #12
+ aes_preload_keys(r0, lr);
+
+ beq .Lecb_entry_192e
+ bhi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc, ...) \
+ .Lecb_entry_##bits##e_d: \
+ cmp r3, #4; \
+ blo .Lecb_loop_##bits##e_d; \
+ \
+ .Lecb_loop4_##bits##e_d: \
+ vld1.8 {q1-q2}, [r2]!; /* load input blocks */ \
+ sub r3, r3, #4; \
+ vld1.8 {q3-q4}, [r2]!; /* load input blocks */ \
+ cmp r3, #4; \
+ \
+ do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1-q2}, [r1]!; /* store output blocks */ \
+ vst1.8 {q3-q4}, [r1]!; /* store output blocks */ \
+ \
+ bhs .Lecb_loop4_##bits##e_d; \
+ cmp r3, #0; \
+ beq .Lecb_done_##e_d; \
+ \
+ .Lecb_loop_##bits##e_d: \
+ vld1.8 {q1}, [r2]!; /* load input block */ \
+ subs r3, r3, #1; \
+ \
+ do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1}, [r1]!; /* store output block */ \
+ bne .Lecb_loop_##bits##e_d; \
+ b .Lecb_done_##e_d;
+
+ ECB_CRYPT(128, e, mc)
+ ECB_CRYPT(192, e, mc, r0, lr)
+ ECB_CRYPT(256, e, mc, r0, lr)
+
+.Lecb_done_e:
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lecb_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+.type _gcry_aes_ecb_dec_armv8_ce,%function;
+_gcry_aes_ecb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: nblocks
+ * %st+0: nrounds => r4
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ cmp r3, #0
+ beq .Lecb_enc_skip
+ ldr r4, [sp, #(16+0)]
+ vpush {q4-q7}
+
+ cmp r4, #12
+
+ aes_preload_keys(r0, lr);
+
+ beq .Lecb_entry_192d
+ bhi .Lecb_entry_256d
+
+ ECB_CRYPT(128, d, imc)
+ ECB_CRYPT(192, d, imc, r0, lr)
+ ECB_CRYPT(256, d, imc, r0, lr)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lecb_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;
+
+
+/*
* void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
@@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce:
/*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)
+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr
@@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce:
/*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)
+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr
@@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce:
/*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
* const unsigned char *abuf,
* unsigned char *offset,
* unsigned char *checksum,
@@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)
+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr