author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2016-09-04 13:41:02 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2016-09-05 20:08:48 +0300
commit     4cd8d40d698564d24ece2af24546e34c58bf2961 (patch)
tree       f2c84d8d30ad74654b1fa94c4b12452740b5559b /cipher/rijndael-armv8-aarch64-ce.S
parent     0b332c1aef03a735c1fb0df184f74d523deb2f98 (diff)
download   libgcrypt-4cd8d40d698564d24ece2af24546e34c58bf2961.tar.gz
Add ARMv8/AArch64 Crypto Extension implementation of AES
* cipher/Makefile.am: Add 'rijndael-armv8-aarch64-ce.S'.
* cipher/rijndael-armv8-aarch64-ce.S: New.
* cipher/rijndael-internal.h (USE_ARM_CE): Enable for ARMv8/AArch64.
* configure.ac: Add 'rijndael-armv8-aarch64-ce.lo' and 'rijndael-armv8-ce.lo'
  for ARMv8/AArch64.
--

Improvement vs AArch64 assembly on Cortex-A53:

           AES-128  AES-192  AES-256
CBC enc:    13.19x   13.53x   13.76x
CBC dec:    20.53x   21.91x   22.60x
CFB enc:    14.29x   14.50x   14.63x
CFB dec:    20.42x   21.69x   22.50x
CTR:        18.29x   19.61x   20.53x
OCB enc:    15.21x   16.32x   17.12x
OCB dec:    14.95x   16.11x   16.88x
OCB auth:   16.73x   17.93x   18.66x

Benchmark on Cortex-A53 (1152 MHz):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     21.86 ns/B     43.62 MiB/s     25.19 c/B
        ECB dec |     22.68 ns/B     42.05 MiB/s     26.13 c/B
        CBC enc |     18.66 ns/B     51.10 MiB/s     21.50 c/B
        CBC dec |     18.72 ns/B     50.95 MiB/s     21.56 c/B
        CFB enc |     18.61 ns/B     51.25 MiB/s     21.44 c/B
        CFB dec |     18.61 ns/B     51.25 MiB/s     21.44 c/B
        OFB enc |     22.84 ns/B     41.75 MiB/s     26.31 c/B
        OFB dec |     22.84 ns/B     41.75 MiB/s     26.31 c/B
        CTR enc |     18.89 ns/B     50.50 MiB/s     21.76 c/B
        CTR dec |     18.89 ns/B     50.50 MiB/s     21.76 c/B
        CCM enc |     37.55 ns/B     25.40 MiB/s     43.25 c/B
        CCM dec |     37.55 ns/B     25.40 MiB/s     43.25 c/B
       CCM auth |     18.77 ns/B     50.80 MiB/s     21.63 c/B
        GCM enc |     20.18 ns/B     47.25 MiB/s     23.25 c/B
        GCM dec |     20.18 ns/B     47.25 MiB/s     23.25 c/B
       GCM auth |      1.30 ns/B     732.5 MiB/s      1.50 c/B
        OCB enc |     19.67 ns/B     48.48 MiB/s     22.66 c/B
        OCB dec |     19.73 ns/B     48.34 MiB/s     22.72 c/B
       OCB auth |     19.46 ns/B     49.00 MiB/s     22.42 c/B
                =
 AES192         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        ECB dec |     26.15 ns/B     36.47 MiB/s     30.13 c/B
        CBC enc |     22.08 ns/B     43.19 MiB/s     25.44 c/B
        CBC dec |     22.25 ns/B     42.87 MiB/s     25.63 c/B
        CFB enc |     22.03 ns/B     43.30 MiB/s     25.38 c/B
        CFB dec |     22.03 ns/B     43.29 MiB/s     25.38 c/B
        OFB enc |     26.26 ns/B     36.32 MiB/s     30.25 c/B
        OFB dec |     26.26 ns/B     36.32 MiB/s     30.25 c/B
        CTR enc |     22.30 ns/B     42.76 MiB/s     25.69 c/B
        CTR dec |     22.30 ns/B     42.76 MiB/s     25.69 c/B
        CCM enc |     44.38 ns/B     21.49 MiB/s     51.13 c/B
        CCM dec |     44.38 ns/B     21.49 MiB/s     51.13 c/B
       CCM auth |     22.20 ns/B     42.97 MiB/s     25.57 c/B
        GCM enc |     23.60 ns/B     40.41 MiB/s     27.19 c/B
        GCM dec |     23.60 ns/B     40.41 MiB/s     27.19 c/B
       GCM auth |      1.30 ns/B     732.4 MiB/s      1.50 c/B
        OCB enc |     23.09 ns/B     41.31 MiB/s     26.60 c/B
        OCB dec |     23.21 ns/B     41.09 MiB/s     26.74 c/B
       OCB auth |     22.88 ns/B     41.68 MiB/s     26.36 c/B
                =
 AES256         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     28.76 ns/B     33.17 MiB/s     33.13 c/B
        ECB dec |     29.46 ns/B     32.37 MiB/s     33.94 c/B
        CBC enc |     25.45 ns/B     37.48 MiB/s     29.31 c/B
        CBC dec |     25.50 ns/B     37.40 MiB/s     29.38 c/B
        CFB enc |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        CFB dec |     25.39 ns/B     37.56 MiB/s     29.25 c/B
        OFB enc |     29.62 ns/B     32.19 MiB/s     34.13 c/B
        OFB dec |     29.62 ns/B     32.19 MiB/s     34.13 c/B
        CTR enc |     25.67 ns/B     37.15 MiB/s     29.57 c/B
        CTR dec |     25.67 ns/B     37.15 MiB/s     29.57 c/B
        CCM enc |     51.11 ns/B     18.66 MiB/s     58.88 c/B
        CCM dec |     51.11 ns/B     18.66 MiB/s     58.88 c/B
       CCM auth |     25.56 ns/B     37.32 MiB/s     29.44 c/B
        GCM enc |     26.96 ns/B     35.37 MiB/s     31.06 c/B
        GCM dec |     26.98 ns/B     35.35 MiB/s     31.08 c/B
       GCM auth |      1.30 ns/B     733.4 MiB/s      1.50 c/B
        OCB enc |     26.45 ns/B     36.05 MiB/s     30.47 c/B
        OCB dec |     26.53 ns/B     35.95 MiB/s     30.56 c/B
       OCB auth |     26.24 ns/B     36.34 MiB/s     30.23 c/B
                =

After:
 Cipher:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.83 ns/B     197.5 MiB/s      5.56 c/B
        ECB dec |      4.99 ns/B     191.1 MiB/s      5.75 c/B
        CBC enc |      1.41 ns/B     675.5 MiB/s      1.63 c/B
        CBC dec |     0.911 ns/B    1046.9 MiB/s      1.05 c/B
        CFB enc |      1.30 ns/B     732.2 MiB/s      1.50 c/B
        CFB dec |     0.911 ns/B    1046.7 MiB/s      1.05 c/B
        OFB enc |      5.81 ns/B     164.3 MiB/s      6.69 c/B
        OFB dec |      5.81 ns/B     164.3 MiB/s      6.69 c/B
        CTR enc |      1.03 ns/B     924.0 MiB/s      1.19 c/B
        CTR dec |      1.03 ns/B     924.1 MiB/s      1.19 c/B
        CCM enc |      2.50 ns/B     381.8 MiB/s      2.88 c/B
        CCM dec |      2.50 ns/B     381.7 MiB/s      2.88 c/B
       CCM auth |      1.57 ns/B     606.1 MiB/s      1.81 c/B
        GCM enc |      2.33 ns/B     408.5 MiB/s      2.69 c/B
        GCM dec |      2.34 ns/B     408.4 MiB/s      2.69 c/B
       GCM auth |      1.30 ns/B     732.1 MiB/s      1.50 c/B
        OCB enc |      1.29 ns/B     736.6 MiB/s      1.49 c/B
        OCB dec |      1.32 ns/B     724.4 MiB/s      1.52 c/B
       OCB auth |      1.16 ns/B     819.6 MiB/s      1.34 c/B
                =
 AES192         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      5.48 ns/B     174.0 MiB/s      6.31 c/B
        ECB dec |      5.64 ns/B     169.0 MiB/s      6.50 c/B
        CBC enc |      1.63 ns/B     585.8 MiB/s      1.88 c/B
        CBC dec |      1.02 ns/B     935.8 MiB/s      1.17 c/B
        CFB enc |      1.52 ns/B     627.7 MiB/s      1.75 c/B
        CFB dec |      1.02 ns/B     935.9 MiB/s      1.17 c/B
        OFB enc |      6.46 ns/B     147.7 MiB/s      7.44 c/B
        OFB dec |      6.46 ns/B     147.7 MiB/s      7.44 c/B
        CTR enc |      1.14 ns/B     836.1 MiB/s      1.31 c/B
        CTR dec |      1.14 ns/B     835.9 MiB/s      1.31 c/B
        CCM enc |      2.83 ns/B     337.6 MiB/s      3.25 c/B
        CCM dec |      2.82 ns/B     338.0 MiB/s      3.25 c/B
       CCM auth |      1.79 ns/B     532.7 MiB/s      2.06 c/B
        GCM enc |      2.44 ns/B     390.3 MiB/s      2.82 c/B
        GCM dec |      2.44 ns/B     390.2 MiB/s      2.82 c/B
       GCM auth |      1.30 ns/B     731.9 MiB/s      1.50 c/B
        OCB enc |      1.41 ns/B     674.7 MiB/s      1.63 c/B
        OCB dec |      1.44 ns/B     662.0 MiB/s      1.66 c/B
       OCB auth |      1.28 ns/B     746.1 MiB/s      1.47 c/B
                =
 AES256         |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      6.13 ns/B     155.5 MiB/s      7.06 c/B
        ECB dec |      6.29 ns/B     151.5 MiB/s      7.25 c/B
        CBC enc |      1.85 ns/B     516.8 MiB/s      2.13 c/B
        CBC dec |      1.13 ns/B     845.6 MiB/s      1.30 c/B
        CFB enc |      1.74 ns/B     549.5 MiB/s      2.00 c/B
        CFB dec |      1.13 ns/B     846.1 MiB/s      1.30 c/B
        OFB enc |      7.11 ns/B     134.2 MiB/s      8.19 c/B
        OFB dec |      7.11 ns/B     134.2 MiB/s      8.19 c/B
        CTR enc |      1.25 ns/B     763.5 MiB/s      1.44 c/B
        CTR dec |      1.25 ns/B     763.4 MiB/s      1.44 c/B
        CCM enc |      3.15 ns/B     302.9 MiB/s      3.63 c/B
        CCM dec |      3.15 ns/B     302.9 MiB/s      3.63 c/B
       CCM auth |      2.01 ns/B     474.2 MiB/s      2.32 c/B
        GCM enc |      2.55 ns/B     374.2 MiB/s      2.94 c/B
        GCM dec |      2.55 ns/B     373.7 MiB/s      2.94 c/B
       GCM auth |      1.30 ns/B     732.2 MiB/s      1.50 c/B
        OCB enc |      1.54 ns/B     617.6 MiB/s      1.78 c/B
        OCB dec |      1.57 ns/B     606.8 MiB/s      1.81 c/B
       OCB auth |      1.40 ns/B     679.8 MiB/s      1.62 c/B
                =

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
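The gain comes from the AArch64 Crypto Extension instructions AESE/AESMC (and AESD/AESIMC for decryption), each of which performs a complete AES round step on a 128-bit vector register. As a rough illustration only (not part of this patch), the following C sketch using the ACLE intrinsics from <arm_neon.h> computes the same sequence that the do_aes_one128 assembly macro below expresses; the function name and round-key array are hypothetical, and the code assumes a compiler targeting armv8-a+crypto.

    #include <arm_neon.h>

    /* Illustrative sketch: one AES-128 block encryption with the Crypto
     * Extensions.  rk[0..10] holds the expanded round keys.  AESE performs
     * AddRoundKey+SubBytes+ShiftRows and AESMC performs MixColumns, matching
     * the aese/aesmc pairs chained by do_aes_one128. */
    uint8x16_t aes128_ce_encrypt_block (uint8x16_t block, const uint8x16_t rk[11])
    {
      int i;
      for (i = 0; i < 9; i++)
        block = vaesmcq_u8 (vaeseq_u8 (block, rk[i]));
      /* Final round has no MixColumns; finish with the last round key. */
      return veorq_u8 (vaeseq_u8 (block, rk[9]), rk[10]);
    }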
Diffstat (limited to 'cipher/rijndael-armv8-aarch64-ce.S')
-rw-r--r--  cipher/rijndael-armv8-aarch64-ce.S  1265
1 file changed, 1265 insertions(+), 0 deletions(-)
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
new file mode 100644
index 00000000..21d0aec8
--- /dev/null
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -0,0 +1,1265 @@
+/* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.arch armv8-a+crypto
+
+.text
+
+
+#if (SIZEOF_VOID_P == 4)
+ #define ptr8 w8
+ #define ptr9 w9
+ #define ptr10 w10
+ #define ptr11 w11
+ #define ptr_sz 4
+#elif (SIZEOF_VOID_P == 8)
+ #define ptr8 x8
+ #define ptr9 x9
+ #define ptr10 x10
+ #define ptr11 x11
+ #define ptr_sz 8
+#else
+ #error "missing SIZEOF_VOID_P"
+#endif
+
+
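+/* Load the address of the data symbol 'name' into 'reg' through the GOT,
+ * keeping the code position-independent. */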
+#define GET_DATA_POINTER(reg, name) \
+ adrp reg, :got:name ; \
+ ldr reg, [reg, #:got_lo12:name] ;
+
+
+/* Register macros */
+
+#define vk0 v17
+#define vk1 v18
+#define vk2 v19
+#define vk3 v20
+#define vk4 v21
+#define vk5 v22
+#define vk6 v23
+#define vk7 v24
+#define vk8 v25
+#define vk9 v26
+#define vk10 v27
+#define vk11 v28
+#define vk12 v29
+#define vk13 v30
+#define vk14 v31
+
+
+/* AES macros */
+
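+/* Load the key schedule into vk0..vk14: vk0-vk10 always (AES-128, 10 rounds),
+ * vk11-vk12 additionally for AES-192 and vk13-vk14 for AES-256.  The
+ * 'cmp nrounds, #12' also leaves the flags set for the callers' later
+ * b.eq/b.hi key-size dispatch. */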
+#define aes_preload_keys(keysched, nrounds) \
+ cmp nrounds, #12; \
+ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \
+ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \
+ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \
+ b.lo 1f; \
+ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \
+ b.eq 1f; \
+ ld1 {vk13.16b-vk14.16b}, [keysched]; \
+1: ;
+
+#define do_aes_one128(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ eor vo.16b, vb.16b, vk10.16b;
+
+#define do_aes_one192(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk10.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk11.16b; \
+ eor vo.16b, vb.16b, vk12.16b;
+
+#define do_aes_one256(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk10.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk11.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk12.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk13.16b; \
+ eor vo.16b, vb.16b, vk14.16b;
+
+#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
+ aes##ed b0.16b, key.16b; \
+ aes##mcimc b0.16b, b0.16b; \
+ aes##ed b1.16b, key.16b; \
+ aes##mcimc b1.16b, b1.16b; \
+ aes##ed b2.16b, key.16b; \
+ aes##mcimc b2.16b, b2.16b; \
+ aes##ed b3.16b, key.16b; \
+ aes##mcimc b3.16b, b3.16b;
+
+#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \
+ aes##ed b0.16b, key1.16b; \
+ eor b0.16b, b0.16b, key2.16b; \
+ aes##ed b1.16b, key1.16b; \
+ eor b1.16b, b1.16b, key2.16b; \
+ aes##ed b2.16b, key1.16b; \
+ eor b2.16b, b2.16b, key2.16b; \
+ aes##ed b3.16b, key1.16b; \
+ eor b3.16b, b3.16b, key2.16b;
+
+#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10);
+
+#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12);
+
+#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14);
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
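+/* Wipe the round-key registers after use; vk11-vk14 are cleared only when
+ * the key size means they were actually loaded. */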
+#define aes_clear_keys(nrounds) \
+ cmp nrounds, #12; \
+ CLEAR_REG(vk0); \
+ CLEAR_REG(vk1); \
+ CLEAR_REG(vk2); \
+ CLEAR_REG(vk3); \
+ CLEAR_REG(vk4); \
+ CLEAR_REG(vk5); \
+ CLEAR_REG(vk6); \
+ CLEAR_REG(vk7); \
+ CLEAR_REG(vk9); \
+ CLEAR_REG(vk8); \
+ CLEAR_REG(vk10); \
+ b.lo 1f; \
+ CLEAR_REG(vk11); \
+ CLEAR_REG(vk12); \
+ b.eq 1f; \
+ CLEAR_REG(vk13); \
+ CLEAR_REG(vk14); \
+1: ;
+
+
+/*
+ * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_enc_armv8_ce
+.type _gcry_aes_enc_armv8_ce,%function;
+_gcry_aes_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: dst
+ * x2: src
+ * w3: nrounds
+ */
+
+ aes_preload_keys(x0, w3);
+
+ ld1 {v0.16b}, [x2]
+
+ b.hi .Lenc1_256
+ b.eq .Lenc1_192
+
+.Lenc1_128:
+ do_aes_one128(e, mc, v0, v0);
+
+.Lenc1_tail:
+ CLEAR_REG(vk0)
+ CLEAR_REG(vk1)
+ CLEAR_REG(vk2)
+ CLEAR_REG(vk3)
+ CLEAR_REG(vk4)
+ CLEAR_REG(vk5)
+ CLEAR_REG(vk6)
+ CLEAR_REG(vk7)
+ CLEAR_REG(vk8)
+ CLEAR_REG(vk9)
+ CLEAR_REG(vk10)
+ st1 {v0.16b}, [x1]
+ CLEAR_REG(v0)
+
+ mov x0, #0
+ ret
+
+.Lenc1_192:
+ do_aes_one192(e, mc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ b .Lenc1_tail
+
+.Lenc1_256:
+ do_aes_one256(e, mc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ CLEAR_REG(vk13)
+ CLEAR_REG(vk14)
+ b .Lenc1_tail
+.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;
+
+
+/*
+ * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_dec_armv8_ce
+.type _gcry_aes_dec_armv8_ce,%function;
+_gcry_aes_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: dst
+ * x2: src
+ * w3: nrounds
+ */
+
+ aes_preload_keys(x0, w3);
+
+ ld1 {v0.16b}, [x2]
+
+ b.hi .Ldec1_256
+ b.eq .Ldec1_192
+
+.Ldec1_128:
+ do_aes_one128(d, imc, v0, v0);
+
+.Ldec1_tail:
+ CLEAR_REG(vk0)
+ CLEAR_REG(vk1)
+ CLEAR_REG(vk2)
+ CLEAR_REG(vk3)
+ CLEAR_REG(vk4)
+ CLEAR_REG(vk5)
+ CLEAR_REG(vk6)
+ CLEAR_REG(vk7)
+ CLEAR_REG(vk8)
+ CLEAR_REG(vk9)
+ CLEAR_REG(vk10)
+ st1 {v0.16b}, [x1]
+ CLEAR_REG(v0)
+
+ mov x0, #0
+ ret
+
+.Ldec1_192:
+ do_aes_one192(d, imc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ b .Ldec1_tail
+
+.Ldec1_256:
+ do_aes_one256(d, imc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ CLEAR_REG(vk13)
+ CLEAR_REG(vk14)
+ b .Ldec1_tail
+.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * int cbc_mac, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_enc_armv8_ce
+.type _gcry_aes_cbc_enc_armv8_ce,%function;
+_gcry_aes_cbc_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: cbc_mac
+ * w6: nrounds
+ */
+
+ cbz x4, .Lcbc_enc_skip
+
+ cmp w5, #0
+ ld1 {v1.16b}, [x3] /* load IV */
+ cset x5, eq
+
+ aes_preload_keys(x0, w6);
+ lsl x5, x5, #4
+
+ b.eq .Lcbc_enc_loop192
+ b.hi .Lcbc_enc_loop256
+
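+/* x5 is the output stride: 16 bytes normally, 0 in CBC-MAC mode so that each
+ * block overwrites the previous one and only the final MAC remains at outbuf. */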
+#define CBC_ENC(bits) \
+ .Lcbc_enc_loop##bits: \
+ ld1 {v0.16b}, [x2], #16; /* load plaintext */ \
+ eor v1.16b, v0.16b, v1.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ st1 {v1.16b}, [x1], x5; /* store ciphertext */ \
+ \
+ cbnz x4, .Lcbc_enc_loop##bits; \
+ b .Lcbc_enc_done;
+
+ CBC_ENC(128)
+ CBC_ENC(192)
+ CBC_ENC(256)
+
+#undef CBC_ENC
+
+.Lcbc_enc_done:
+ aes_clear_keys(w6)
+
+ st1 {v1.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v1)
+ CLEAR_REG(v0)
+
+.Lcbc_enc_skip:
+ ret
+.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;
+
+/*
+ * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_dec_armv8_ce
+.type _gcry_aes_cbc_dec_armv8_ce,%function;
+_gcry_aes_cbc_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lcbc_dec_skip
+
+ ld1 {v0.16b}, [x3] /* load IV */
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcbc_dec_entry_192
+ b.hi .Lcbc_dec_entry_256
+
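+/* The four-block path keeps copies of the ciphertext blocks in v5-v7/v16 so
+ * that each block's predecessor (its CBC chaining value) is still available
+ * after the in-place decryption. */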
+#define CBC_DEC(bits) \
+ .Lcbc_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lcbc_dec_loop_##bits; \
+ \
+ .Lcbc_dec_loop4_##bits: \
+ \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \
+ sub x4, x4, #4; \
+ mov v5.16b, v1.16b; \
+ mov v6.16b, v2.16b; \
+ mov v7.16b, v3.16b; \
+ mov v16.16b, v4.16b; \
+ cmp x4, #4; \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ eor v2.16b, v2.16b, v5.16b; \
+ st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ mov v0.16b, v16.16b; /* next IV */ \
+ st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \
+ \
+ b.hs .Lcbc_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ CLEAR_REG(v16); \
+ cbz x4, .Lcbc_dec_done; \
+ \
+ .Lcbc_dec_loop_##bits: \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ sub x4, x4, #1; \
+ mov v2.16b, v1.16b; \
+ \
+ do_aes_one##bits(d, imc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ mov v0.16b, v2.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lcbc_dec_loop_##bits; \
+ b .Lcbc_dec_done;
+
+ CBC_DEC(128)
+ CBC_DEC(192)
+ CBC_DEC(256)
+
+#undef CBC_DEC
+
+.Lcbc_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lcbc_dec_skip:
+ ret
+.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr_enc_armv8_ce
+.type _gcry_aes_ctr_enc_armv8_ce,%function;
+_gcry_aes_ctr_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lctr_enc_skip
+
+ mov x6, #1
+ movi v16.16b, #0
+ mov v16.D[1], x6
+
+ /* load IV */
+ ldp x9, x10, [x3]
+ ld1 {v0.16b}, [x3]
+ rev x9, x9
+ rev x10, x10
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lctr_enc_entry_192
+ b.hi .Lctr_enc_entry_256
+
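+/* The 128-bit big-endian counter is mirrored native-endian in x9 (high) and
+ * x10 (low).  The four-block path increments it with NEON 64-bit adds when
+ * the low word cannot wrap within four steps; otherwise the scalar path
+ * propagates the carry with adds/adc. */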
+#define CTR_ENC(bits) \
+ .Lctr_enc_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lctr_enc_loop_##bits; \
+ \
+ .Lctr_enc_loop4_##bits: \
+ cmp x10, #0xfffffffffffffffc; \
+ sub x4, x4, #4; \
+ b.lo .Lctr_enc_loop4_##bits##_nocarry; \
+ \
+ adds x10, x10, #1; \
+ mov v1.16b, v0.16b; \
+ adc x9, x9, xzr; \
+ mov v2.D[1], x10; \
+ mov v2.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v2.16b, v2.16b; \
+ adc x9, x9, xzr; \
+ mov v3.D[1], x10; \
+ mov v3.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v3.16b, v3.16b; \
+ adc x9, x9, xzr; \
+ mov v4.D[1], x10; \
+ mov v4.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v4.16b, v4.16b; \
+ adc x9, x9, xzr; \
+ mov v0.D[1], x10; \
+ mov v0.D[0], x9; \
+ rev64 v0.16b, v0.16b; \
+ \
+ b .Lctr_enc_loop4_##bits##_store_ctr; \
+ \
+ .Lctr_enc_loop4_##bits##_nocarry: \
+ \
+ add v3.2d, v16.2d, v16.2d; /* 2 */ \
+ rev64 v6.16b, v0.16b; \
+ add x10, x10, #4; \
+ add v4.2d, v3.2d, v16.2d; /* 3 */ \
+ add v0.2d, v3.2d, v3.2d; /* 4 */ \
+ rev64 v1.16b, v6.16b; \
+ add v2.2d, v6.2d, v16.2d; \
+ add v3.2d, v6.2d, v3.2d; \
+ add v4.2d, v6.2d, v4.2d; \
+ add v0.2d, v6.2d, v0.2d; \
+ rev64 v2.16b, v2.16b; \
+ rev64 v3.16b, v3.16b; \
+ rev64 v0.16b, v0.16b; \
+ rev64 v4.16b, v4.16b; \
+ \
+ .Lctr_enc_loop4_##bits##_store_ctr: \
+ \
+ st1 {v0.16b}, [x3]; \
+ cmp x4, #4; \
+ ld1 {v5.16b-v7.16b}, [x2], #48; /* preload plaintext */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; \
+ ld1 {v5.16b}, [x2], #16; /* load plaintext */ \
+ eor v2.16b, v2.16b, v6.16b; \
+ eor v3.16b, v3.16b, v7.16b; \
+ eor v4.16b, v4.16b, v5.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store ciphertext */ \
+ \
+ b.hs .Lctr_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lctr_enc_done; \
+ \
+ .Lctr_enc_loop_##bits: \
+ \
+ adds x10, x10, #1; \
+ mov v1.16b, v0.16b; \
+ adc x9, x9, xzr; \
+ mov v0.D[1], x10; \
+ mov v0.D[0], x9; \
+ sub x4, x4, #1; \
+ ld1 {v2.16b}, [x2], #16; /* load plaintext */ \
+ rev64 v0.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v2.16b, v1.16b; \
+ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x4, .Lctr_enc_loop_##bits; \
+ b .Lctr_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192)
+ CTR_ENC(256)
+
+#undef CTR_ENC
+
+.Lctr_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lctr_enc_skip:
+ ret
+
+.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_enc_armv8_ce
+.type _gcry_aes_cfb_enc_armv8_ce,%function;
+_gcry_aes_cfb_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lcfb_enc_skip
+
+ /* load IV */
+ ld1 {v0.16b}, [x3]
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcfb_enc_entry_192
+ b.hi .Lcfb_enc_entry_256
+
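+/* CFB encryption is inherently sequential (each keystream block is the
+ * encryption of the previous ciphertext block), hence no four-block path. */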
+#define CFB_ENC(bits) \
+ .Lcfb_enc_entry_##bits: \
+ .Lcfb_enc_loop_##bits: \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v0, v0); \
+ \
+ eor v0.16b, v1.16b, v0.16b; \
+ st1 {v0.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x4, .Lcfb_enc_loop_##bits; \
+ b .Lcfb_enc_done;
+
+ CFB_ENC(128)
+ CFB_ENC(192)
+ CFB_ENC(256)
+
+#undef CFB_ENC
+
+.Lcfb_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+
+.Lcfb_enc_skip:
+ ret
+.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_dec_armv8_ce
+.type _gcry_aes_cfb_dec_armv8_ce,%function;
+_gcry_aes_cfb_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+
+ cbz x4, .Lcfb_dec_skip
+
+ /* load IV */
+ ld1 {v0.16b}, [x3]
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcfb_dec_entry_192
+ b.hi .Lcfb_dec_entry_256
+
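+/* CFB decryption can process four blocks in parallel, since all keystream
+ * inputs (the preceding ciphertext blocks) are already available. */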
+#define CFB_DEC(bits) \
+ .Lcfb_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lcfb_dec_loop_##bits; \
+ \
+ .Lcfb_dec_loop4_##bits: \
+ \
+ ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \
+ mov v1.16b, v0.16b; \
+ sub x4, x4, #4; \
+ cmp x4, #4; \
+ mov v5.16b, v2.16b; \
+ mov v6.16b, v3.16b; \
+ mov v7.16b, v4.16b; \
+ ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; \
+ eor v2.16b, v2.16b, v6.16b; \
+ eor v3.16b, v3.16b, v7.16b; \
+ eor v4.16b, v4.16b, v0.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lcfb_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lcfb_dec_done; \
+ \
+ .Lcfb_dec_loop_##bits: \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v0, v0); \
+ \
+ eor v2.16b, v1.16b, v0.16b; \
+ mov v0.16b, v1.16b; \
+ st1 {v2.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lcfb_dec_loop_##bits; \
+ b .Lcfb_dec_done;
+
+ CFB_DEC(128)
+ CFB_DEC(192)
+ CFB_DEC(256)
+
+#undef CFB_DEC
+
+.Lcfb_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lcfb_dec_skip:
+ ret
+.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_enc_armv8_ce
+.type _gcry_aes_ocb_enc_armv8_ce,%function;
+_gcry_aes_ocb_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: offset
+ * x4: checksum
+ * x5: Ls
+ * x6: nblocks (0 < nblocks <= 32)
+ * w7: nrounds
+ */
+
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ aes_preload_keys(x0, w7);
+
+ b.eq .Locb_enc_entry_192
+ b.hi .Locb_enc_entry_256
+
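+/* Ls is a caller-provided table of pointers to the precomputed L_{ntz(i)}
+ * values for the next nblocks blocks; the four-block path fetches four table
+ * entries at a time into ptr8-ptr11. */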
+#define OCB_ENC(bits, ...) \
+ .Locb_enc_entry_##bits: \
+ cmp x6, #4; \
+ b.lo .Locb_enc_loop_##bits; \
+ \
+ .Locb_enc_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
+ ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
+ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
+ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
+ st1 {v1.16b-v4.16b}, [x1], #64; \
+ \
+ b.hs .Locb_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_enc_done; \
+ \
+ .Locb_enc_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ ldr ptr8, [x5], #(ptr_sz); \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v16.16b, v16.16b, v1.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x6, .Locb_enc_loop_##bits; \
+ b .Locb_enc_done;
+
+ OCB_ENC(128)
+ OCB_ENC(192)
+ OCB_ENC(256)
+
+#undef OCB_ENC
+
+.Locb_enc_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_dec_armv8_ce
+.type _gcry_aes_ocb_dec_armv8_ce,%function;
+_gcry_aes_ocb_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: offset
+ * x4: checksum
+ * x5: Ls
+ * x6: nblocks (0 < nblocks <= 32)
+ * w7: nrounds
+ */
+
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ aes_preload_keys(x0, w7);
+
+ b.eq .Locb_dec_entry_192
+ b.hi .Locb_dec_entry_256
+
+#define OCB_DEC(bits) \
+ .Locb_dec_entry_##bits: \
+ cmp x6, #4; \
+ b.lo .Locb_dec_loop_##bits; \
+ \
+ .Locb_dec_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
+ ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
+ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
+ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
+ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
+ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
+ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
+ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
+ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
+ st1 {v1.16b-v4.16b}, [x1], #64; \
+ \
+ b.hs .Locb_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_dec_done; \
+ \
+ .Locb_dec_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ ldr ptr8, [x5], #(ptr_sz); \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(d, imc, v1, v1) \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ cbnz x6, .Locb_dec_loop_##bits; \
+ b .Locb_dec_done;
+
+ OCB_DEC(128)
+ OCB_DEC(192)
+ OCB_DEC(256)
+
+#undef OCB_DEC
+
+.Locb_dec_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * const unsigned char *abuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * void **Ls,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_auth_armv8_ce
+.type _gcry_aes_ocb_auth_armv8_ce,%function;
+_gcry_aes_ocb_auth_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: abuf
+ * x2: offset => x3
+ * x3: checksum => x4
+ * x4: Ls => x5
+ * x5: nblocks => x6 (0 < nblocks <= 32)
+ * w6: nrounds => w7
+ */
+ mov x7, x6
+ mov x6, x5
+ mov x5, x4
+ mov x4, x3
+ mov x3, x2
+
+ aes_preload_keys(x0, w7);
+
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ b.eq .Locb_auth_entry_192
+ b.hi .Locb_auth_entry_256
+
+#define OCB_AUTH(bits) \
+ .Locb_auth_entry_##bits: \
+ cmp x6, #4; \
+ b.lo .Locb_auth_loop_##bits; \
+ \
+ .Locb_auth_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ ldp ptr8, ptr9, [x5], #(ptr_sz*2); \
+ \
+ ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
+ ldp ptr10, ptr11, [x5], #(ptr_sz*2); \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* A_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* A_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* A_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* A_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v16.16b, v16.16b, v3.16b; \
+ eor v1.16b, v1.16b, v4.16b; \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ b.hs .Locb_auth_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_auth_done; \
+ \
+ .Locb_auth_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ ldr ptr8, [x5], #(ptr_sz); \
+ ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1) \
+ \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ cbnz x6, .Locb_auth_loop_##bits; \
+ b .Locb_auth_done;
+
+ OCB_AUTH(128)
+ OCB_AUTH(192)
+ OCB_AUTH(256)
+
+#undef OCB_AUTH
+
+.Locb_auth_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+
+
+/*
+ * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+ */
+.align 3
+.globl _gcry_aes_sbox4_armv8_ce
+.type _gcry_aes_sbox4_armv8_ce,%function;
+_gcry_aes_sbox4_armv8_ce:
+ /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+ movi v0.16b, #0x52
+ movi v1.16b, #0
+ mov v0.S[0], w0
+ aese v0.16b, v1.16b
+ addv s0, v0.4s
+ mov w0, v0.S[0]
+ CLEAR_REG(v0)
+ ret
+.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;
+
+
+/*
+ * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
+ */
+.align 3
+.globl _gcry_aes_invmixcol_armv8_ce
+.type _gcry_aes_invmixcol_armv8_ce,%function;
+_gcry_aes_invmixcol_armv8_ce:
+ ld1 {v0.16b}, [x1]
+ aesimc v0.16b, v0.16b
+ st1 {v0.16b}, [x0]
+ CLEAR_REG(v0)
+ ret
+.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;
+
+#endif
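For reference, a hypothetical standalone caller of the single-block entry point, based only on the prototype comments in the file above; in libgcrypt itself these functions are reached through the dispatch code in rijndael.c, and 'keysched' must point to the already expanded key schedule laid out as consecutive 16-byte round keys.

    /* Prototype as documented above ('byte' is libgcrypt's unsigned char). */
    extern unsigned int _gcry_aes_enc_armv8_ce (void *keysched, unsigned char *dst,
                                                const unsigned char *src,
                                                unsigned int nrounds);

    /* Hypothetical helper: encrypt one 16-byte block with an expanded AES-128
     * key schedule (nrounds = 10; AES-192/256 would pass 12/14). */
    static void encrypt_block_aes128 (void *keysched, unsigned char dst[16],
                                      const unsigned char src[16])
    {
      _gcry_aes_enc_armv8_ce (keysched, dst, src, 10);
    }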