summaryrefslogtreecommitdiff
path: root/cipher/rijndael-armv8-ce.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2016-07-14 17:55:28 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2016-07-14 17:55:28 +0300
commit05a4cecae0c02d2b4ee1cadd9c08115beae3a94a (patch)
treef52e7cc5468b8165814ed86e860acaf123d4985b /cipher/rijndael-armv8-ce.c
parent962b15470663db11e5c35b86768f1b5d8e600017 (diff)
downloadlibgcrypt-05a4cecae0c02d2b4ee1cadd9c08115beae3a94a.tar.gz
Add ARMv8/AArch32 Crypto Extension implementation of AES
* cipher/Makefile.am: Add 'rijndael-armv8-ce.c' and 'rijndael-armv-aarch32-ce.S'. * cipher/rijndael-armv8-aarch32-ce.S: New. * cipher/rijndael-armv8-ce.c: New. * cipher/rijndael-internal.h (USE_ARM_CE): New. (RIJNDAEL_context_s): Add 'use_arm_ce'. * cipher/rijndael.c [USE_ARM_CE] (_gcry_aes_armv8_ce_setkey) (_gcry_aes_armv8_ce_prepare_decryption) (_gcry_aes_armv8_ce_encrypt, _gcry_aes_armv8_ce_decrypt) (_gcry_aes_armv8_ce_cfb_enc, _gcry_aes_armv8_ce_cbc_enc) (_gcry_aes_armv8_ce_ctr_enc, _gcry_aes_armv8_ce_cfb_dec) (_gcry_aes_armv8_ce_cbc_dec, _gcry_aes_armv8_ce_ocb_crypt) (_gcry_aes_armv8_ce_ocb_auth): New. (do_setkey) [USE_ARM_CE]: Add ARM CE/AES HW feature check and key setup for ARM CE. (prepare_decryption, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc) (_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec) (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth) [USE_ARM_CE]: Add ARM CE support. * configure.ac: Add 'rijndael-armv8-ce.lo' and 'rijndael-armv8-aarch32-ce.lo'. -- Improvement vs ARM assembly on Cortex-A53: AES-128 AES-192 AES-256 CBC enc: 14.8x 12.8x 11.4x CBC dec: 21.4x 20.5x 19.4x CFB enc: 16.2x 13.6x 11.6x CFB dec: 21.6x 20.5x 19.4x CTR: 19.1x 18.6x 17.8x OCB enc: 16.0x 16.2x 16.1x OCB dec: 15.6x 15.9x 15.8x OCB auth: 18.3x 18.4x 18.0x Benchmark on Cortex-A53 (1152 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 24.42 ns/B 39.06 MiB/s 28.13 c/B ECB dec | 25.07 ns/B 38.05 MiB/s 28.88 c/B CBC enc | 21.05 ns/B 45.30 MiB/s 24.25 c/B CBC dec | 21.16 ns/B 45.07 MiB/s 24.38 c/B CFB enc | 21.05 ns/B 45.31 MiB/s 24.25 c/B CFB dec | 21.38 ns/B 44.61 MiB/s 24.62 c/B OFB enc | 26.15 ns/B 36.47 MiB/s 30.13 c/B OFB dec | 26.15 ns/B 36.47 MiB/s 30.13 c/B CTR enc | 21.17 ns/B 45.06 MiB/s 24.38 c/B CTR dec | 21.16 ns/B 45.06 MiB/s 24.38 c/B CCM enc | 42.32 ns/B 22.53 MiB/s 48.75 c/B CCM dec | 42.32 ns/B 22.53 MiB/s 48.75 c/B CCM auth | 21.17 ns/B 45.06 MiB/s 24.38 c/B GCM enc | 22.08 ns/B 43.19 MiB/s 25.44 c/B GCM dec | 22.08 ns/B 43.18 MiB/s 25.44 c/B GCM auth 
| 0.923 ns/B 1032.8 MiB/s 1.06 c/B OCB enc | 26.20 ns/B 36.40 MiB/s 30.18 c/B OCB dec | 25.97 ns/B 36.73 MiB/s 29.91 c/B OCB auth | 24.52 ns/B 38.90 MiB/s 28.24 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 27.83 ns/B 34.26 MiB/s 32.06 c/B ECB dec | 28.54 ns/B 33.42 MiB/s 32.88 c/B CBC enc | 24.47 ns/B 38.97 MiB/s 28.19 c/B CBC dec | 25.27 ns/B 37.74 MiB/s 29.11 c/B CFB enc | 25.08 ns/B 38.02 MiB/s 28.89 c/B CFB dec | 25.31 ns/B 37.68 MiB/s 29.16 c/B OFB enc | 29.57 ns/B 32.25 MiB/s 34.06 c/B OFB dec | 29.57 ns/B 32.25 MiB/s 34.06 c/B CTR enc | 25.24 ns/B 37.78 MiB/s 29.08 c/B CTR dec | 25.24 ns/B 37.79 MiB/s 29.08 c/B CCM enc | 49.81 ns/B 19.15 MiB/s 57.38 c/B CCM dec | 49.80 ns/B 19.15 MiB/s 57.37 c/B CCM auth | 24.58 ns/B 38.80 MiB/s 28.32 c/B GCM enc | 26.15 ns/B 36.47 MiB/s 30.13 c/B GCM dec | 26.11 ns/B 36.52 MiB/s 30.08 c/B GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B OCB enc | 29.59 ns/B 32.23 MiB/s 34.09 c/B OCB dec | 29.42 ns/B 32.42 MiB/s 33.89 c/B OCB auth | 27.92 ns/B 34.16 MiB/s 32.16 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 31.20 ns/B 30.57 MiB/s 35.94 c/B ECB dec | 31.80 ns/B 29.99 MiB/s 36.63 c/B CBC enc | 27.83 ns/B 34.27 MiB/s 32.06 c/B CBC dec | 27.87 ns/B 34.21 MiB/s 32.11 c/B CFB enc | 27.88 ns/B 34.20 MiB/s 32.12 c/B CFB dec | 28.16 ns/B 33.87 MiB/s 32.44 c/B OFB enc | 32.93 ns/B 28.96 MiB/s 37.94 c/B OFB dec | 32.93 ns/B 28.96 MiB/s 37.94 c/B CTR enc | 27.95 ns/B 34.13 MiB/s 32.19 c/B CTR dec | 27.95 ns/B 34.12 MiB/s 32.20 c/B CCM enc | 55.88 ns/B 17.07 MiB/s 64.38 c/B CCM dec | 55.88 ns/B 17.07 MiB/s 64.38 c/B CCM auth | 27.95 ns/B 34.12 MiB/s 32.20 c/B GCM enc | 28.86 ns/B 33.05 MiB/s 33.25 c/B GCM dec | 28.87 ns/B 33.04 MiB/s 33.25 c/B GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B OCB enc | 32.96 ns/B 28.94 MiB/s 37.97 c/B OCB dec | 32.73 ns/B 29.14 MiB/s 37.70 c/B OCB auth | 31.29 ns/B 30.48 MiB/s 36.04 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 5.10 ns/B 187.0 MiB/s 5.88 
c/B ECB dec | 5.27 ns/B 181.0 MiB/s 6.07 c/B CBC enc | 1.41 ns/B 675.8 MiB/s 1.63 c/B CBC dec | 0.992 ns/B 961.7 MiB/s 1.14 c/B CFB enc | 1.30 ns/B 732.4 MiB/s 1.50 c/B CFB dec | 0.991 ns/B 962.7 MiB/s 1.14 c/B OFB enc | 7.05 ns/B 135.2 MiB/s 8.13 c/B OFB dec | 7.05 ns/B 135.2 MiB/s 8.13 c/B CTR enc | 1.11 ns/B 856.9 MiB/s 1.28 c/B CTR dec | 1.11 ns/B 857.0 MiB/s 1.28 c/B CCM enc | 2.58 ns/B 369.8 MiB/s 2.97 c/B CCM dec | 2.58 ns/B 369.5 MiB/s 2.97 c/B CCM auth | 1.58 ns/B 605.2 MiB/s 1.82 c/B GCM enc | 2.04 ns/B 467.9 MiB/s 2.35 c/B GCM dec | 2.04 ns/B 466.6 MiB/s 2.35 c/B GCM auth | 0.923 ns/B 1033.0 MiB/s 1.06 c/B OCB enc | 1.64 ns/B 579.8 MiB/s 1.89 c/B OCB dec | 1.66 ns/B 574.5 MiB/s 1.91 c/B OCB auth | 1.33 ns/B 715.5 MiB/s 1.54 c/B = AES192 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 5.64 ns/B 169.0 MiB/s 6.50 c/B ECB dec | 5.81 ns/B 164.3 MiB/s 6.69 c/B CBC enc | 1.90 ns/B 502.1 MiB/s 2.19 c/B CBC dec | 1.24 ns/B 771.7 MiB/s 1.42 c/B CFB enc | 1.84 ns/B 517.1 MiB/s 2.12 c/B CFB dec | 1.23 ns/B 772.5 MiB/s 1.42 c/B OFB enc | 7.60 ns/B 125.5 MiB/s 8.75 c/B OFB dec | 7.60 ns/B 125.6 MiB/s 8.75 c/B CTR enc | 1.36 ns/B 702.7 MiB/s 1.56 c/B CTR dec | 1.36 ns/B 702.5 MiB/s 1.56 c/B CCM enc | 3.31 ns/B 287.8 MiB/s 3.82 c/B CCM dec | 3.31 ns/B 288.0 MiB/s 3.81 c/B CCM auth | 2.06 ns/B 462.1 MiB/s 2.38 c/B GCM enc | 2.28 ns/B 418.4 MiB/s 2.63 c/B GCM dec | 2.28 ns/B 418.0 MiB/s 2.63 c/B GCM auth | 0.923 ns/B 1032.8 MiB/s 1.06 c/B OCB enc | 1.83 ns/B 520.1 MiB/s 2.11 c/B OCB dec | 1.84 ns/B 517.8 MiB/s 2.12 c/B OCB auth | 1.52 ns/B 626.1 MiB/s 1.75 c/B = AES256 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 5.86 ns/B 162.7 MiB/s 6.75 c/B ECB dec | 6.02 ns/B 158.3 MiB/s 6.94 c/B CBC enc | 2.44 ns/B 390.5 MiB/s 2.81 c/B CBC dec | 1.45 ns/B 656.4 MiB/s 1.67 c/B CFB enc | 2.39 ns/B 399.5 MiB/s 2.75 c/B CFB dec | 1.45 ns/B 656.8 MiB/s 1.67 c/B OFB enc | 7.81 ns/B 122.1 MiB/s 9.00 c/B OFB dec | 7.81 ns/B 122.1 MiB/s 9.00 c/B CTR enc | 1.57 ns/B 605.8 MiB/s 1.81 
c/B CTR dec | 1.57 ns/B 605.9 MiB/s 1.81 c/B CCM enc | 4.07 ns/B 234.3 MiB/s 4.69 c/B CCM dec | 4.07 ns/B 234.1 MiB/s 4.69 c/B CCM auth | 2.61 ns/B 365.7 MiB/s 3.00 c/B GCM enc | 2.50 ns/B 381.9 MiB/s 2.88 c/B GCM dec | 2.49 ns/B 382.3 MiB/s 2.87 c/B GCM auth | 0.926 ns/B 1029.7 MiB/s 1.07 c/B OCB enc | 2.05 ns/B 465.6 MiB/s 2.36 c/B OCB dec | 2.06 ns/B 462.0 MiB/s 2.38 c/B OCB auth | 1.74 ns/B 548.4 MiB/s 2.00 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-armv8-ce.c')
-rw-r--r--cipher/rijndael-armv8-ce.c469
1 files changed, 469 insertions, 0 deletions
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
new file mode 100644
index 00000000..bed40665
--- /dev/null
+++ b/cipher/rijndael-armv8-ce.c
@@ -0,0 +1,469 @@
+/* ARMv8 Crypto Extension AES for Libgcrypt
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_ARM_CE
+
+
+typedef struct u128_s { u32 a, b, c, d; } u128_t; /* Four 32-bit words; used in this file to address one round key at a time. */
+/* Key-setup helpers, only declared here.  _gcry_aes_sbox4_armv8_ce is
+ * called from the key expansion and _gcry_aes_invmixcol_armv8_ce from
+ * the decryption-key preparation.
+ * NOTE(review): presumably implemented in rijndael-armv8-aarch32-ce.S,
+ * which the commit message lists as a new file -- confirm. */
+extern u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+extern void _gcry_aes_invmixcol_armv8_ce(u128_t *dst, const u128_t *src);
+/* Single-call encryption/decryption routines taking a key schedule and a
+ * round count.  Their unsigned int result is returned unchanged by the
+ * wrappers in this file; its meaning is not visible here (presumably a
+ * stack burn depth -- TODO confirm). */
+extern unsigned int _gcry_aes_enc_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+extern unsigned int _gcry_aes_dec_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+/* CBC bulk routines.  Only the encryption variant takes a cbc_mac flag. */
+extern void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ int cbc_mac, unsigned int nrounds);
+extern void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+/* CFB bulk routines.  The wrappers in this file pass the encryption key
+ * schedule to both of them. */
+extern void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+/* CTR bulk routine; the counter block is passed through the iv argument. */
+extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+/* OCB bulk routines.  Ls is an array of pointers, one per block to be
+ * processed, filled in by the C callers in this file. */
+extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ void **Ls,
+ size_t nblocks,
+ unsigned int nrounds);
+/* Shared signature of the OCB encrypt and decrypt routines, so that
+ * _gcry_aes_armv8_ce_ocb_crypt can select one of them at run time. */
+typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset, unsigned char *checksum,
+ void **Ls, size_t nblocks,
+ unsigned int nrounds);
+/* Expand KEY into the encryption key schedule of CTX.
+ *
+ * ctx->rounds must be set before the call: the key size is derived from
+ * it as KC = rounds - 6 words of 32 bits, so KC * 4 bytes are read from
+ * KEY.  rounds + 1 round keys of four 32-bit words each are written to
+ * ctx->keyschenc32.  The on-stack copies of the key material are wiped
+ * before returning. */
+void
+_gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ u32 data32[MAXKC];
+ } tkk[2]; /* tkk[0]: raw key bytes; tkk[1]: working words of the expansion */
+ unsigned int rounds = ctx->rounds;
+ int KC = rounds - 6; /* key length in 32-bit words */
+ unsigned int keylen = KC * 4; /* key length in bytes */
+ unsigned int i, r, t; /* i: key byte index; r: round key being filled; t: next word inside it */
+ byte rcon = 1; /* round constant, advanced once per expansion pass */
+ int j;
+#define k tkk[0].data
+#define k_u32 tkk[0].data32
+#define tk tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W (ctx->keyschenc)
+#define W_u32 (ctx->keyschenc32)
+/* Note: the `tk' and `W' aliases are defined but not referenced below;
+ * only k, k_u32, tk_u32 and W_u32 are used. */
+ for (i = 0; i < keylen; i++) /* copy the caller's key bytes into tkk[0] */
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--) /* seed the working words with the raw key */
+ {
+ tk_u32[j] = k_u32[j];
+ }
+ r = 0;
+ t = 0;
+ /* Copy values into round key array.  A round key holds four words, so
+  * with KC of 6 or 8 the working words do not line up with round-key
+  * boundaries; r and t carry the write position across passes.
+  * NOTE(review): le_bswap32 presumably swaps bytes only on big-endian
+  * hosts -- confirm in bufhelp.h. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ while (r < rounds + 1) /* one pass derives the next KC working words */
+ {
+ tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon; /* first word:
+  * combine with the transformed last word and the round constant
+  * (rol is presumably a 32-bit rotate left -- confirm) */
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++) /* every other word is XORed with its predecessor */
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++) /* KC == 8: chain the first half ... */
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+
+ tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]); /* ... extra sbox4 step (no rotate, no rcon) on the middle word ... */
+
+ for (j = KC/2 + 1; j < KC; j++) /* ... then chain the second half */
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+
+ /* Copy values into round key array.  Same copy as before the loop;
+  * it stops once all rounds + 1 round keys are filled. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b); /* shift left; if bit 7 was set, XOR in 0x1b (result truncated to a byte) */
+ }
+
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+ wipememory(&tkk, sizeof(tkk)); /* do not leave key material on the stack */
+}
+
+/* Build the decryption key schedule of CTX from its encryption key
+ * schedule.
+ *
+ * The round keys are copied in reverse order.  The first and the last
+ * one are copied verbatim; every round key in between is passed through
+ * _gcry_aes_invmixcol_armv8_ce.  ctx->rounds selects how many inner
+ * round keys are converted: 9 by default, 11 when rounds is exactly 12
+ * and 13 when rounds is larger than 12.  */
+void
+_gcry_aes_armv8_ce_prepare_decryption (RIJNDAEL_context *ctx)
+{
+  const u128_t *enc_keys = (u128_t *)(void *)ctx->keyschenc;
+  u128_t *dec_keys = (u128_t *)(void *)ctx->keyschdec;
+  int rounds = ctx->rounds;
+  int n_inner = 9;   /* Number of round keys needing the conversion.  */
+  int di;            /* Write index into the decryption schedule.  */
+  int ei;            /* Read index into the encryption schedule.  */
+
+  if (rounds >= 12)
+    n_inner += (rounds > 12) ? 4 : 2;
+
+  /* Last encryption round key becomes the first decryption round key.  */
+  dec_keys[0] = enc_keys[rounds];
+
+  for (di = 1, ei = rounds - 1; di <= n_inner; di++, ei--)
+    _gcry_aes_invmixcol_armv8_ce (&dec_keys[di], &enc_keys[ei]);
+
+  /* First encryption round key becomes the last decryption round key.  */
+  dec_keys[di] = enc_keys[0];
+}
+
+/* Run the single-block encryption routine on SRC, writing to DST, with
+ * the encryption key schedule and round count stored in CTX.  The
+ * routine's return value is handed back unchanged.  */
+unsigned int
+_gcry_aes_armv8_ce_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+                            const unsigned char *src)
+{
+  return _gcry_aes_enc_armv8_ce (ctx->keyschenc32, dst, src, ctx->rounds);
+}
+
+/* Run the single-block decryption routine on SRC, writing to DST, with
+ * the decryption key schedule and round count stored in CTX.  The
+ * routine's return value is handed back unchanged.  */
+unsigned int
+_gcry_aes_armv8_ce_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+                            const unsigned char *src)
+{
+  return _gcry_aes_dec_armv8_ce (ctx->keyschdec32, dst, src, ctx->rounds);
+}
+
+/* CBC encryption of NBLOCKS blocks from INBUF to OUTBUF using IV.
+ * Forwards to the CBC encryption routine with the encryption key
+ * schedule and round count of CTX; CBC_MAC is passed on as given.  */
+void
+_gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *outbuf,
+                            const unsigned char *inbuf, unsigned char *iv,
+                            size_t nblocks, int cbc_mac)
+{
+  _gcry_aes_cbc_enc_armv8_ce (ctx->keyschenc32, outbuf, inbuf, iv, nblocks,
+                              cbc_mac, ctx->rounds);
+}
+
+/* CBC decryption of NBLOCKS blocks from INBUF to OUTBUF using IV.
+ * Forwards to the CBC decryption routine with the decryption key
+ * schedule and round count of CTX.  */
+void
+_gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                            const unsigned char *inbuf, unsigned char *iv,
+                            size_t nblocks)
+{
+  _gcry_aes_cbc_dec_armv8_ce (ctx->keyschdec32, outbuf, inbuf, iv, nblocks,
+                              ctx->rounds);
+}
+
+/* CFB encryption of NBLOCKS blocks from INBUF to OUTBUF using IV.
+ * Forwards to the CFB encryption routine with the encryption key
+ * schedule and round count of CTX.  */
+void
+_gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                            const unsigned char *inbuf, unsigned char *iv,
+                            size_t nblocks)
+{
+  _gcry_aes_cfb_enc_armv8_ce (ctx->keyschenc32, outbuf, inbuf, iv, nblocks,
+                              ctx->rounds);
+}
+
+/* CFB decryption of NBLOCKS blocks from INBUF to OUTBUF using IV.
+ * Forwards to the CFB decryption routine.  Note that the *encryption*
+ * key schedule of CTX is passed, not the decryption one.  */
+void
+_gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                            const unsigned char *inbuf, unsigned char *iv,
+                            size_t nblocks)
+{
+  _gcry_aes_cfb_dec_armv8_ce (ctx->keyschenc32, outbuf, inbuf, iv, nblocks,
+                              ctx->rounds);
+}
+
+/* CTR mode processing of NBLOCKS blocks from INBUF to OUTBUF; IV is
+ * handed to the low-level routine as its counter/iv argument.  Forwards
+ * with the encryption key schedule and round count of CTX.  */
+void
+_gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+                            const unsigned char *inbuf, unsigned char *iv,
+                            size_t nblocks)
+{
+  _gcry_aes_ctr_enc_armv8_ce (ctx->keyschenc32, outbuf, inbuf, iv, nblocks,
+                              ctx->rounds);
+}
+/* Bulk OCB encryption or decryption.
+ *
+ * Processes NBLOCKS blocks of 16 bytes from INBUF_ARG into OUTBUF_ARG.
+ * A non-zero ENCRYPT selects the encryption routine and key schedule,
+ * zero selects the decryption ones.  c->u_mode.ocb.data_nblocks is
+ * advanced by NBLOCKS.  c->u_iv.iv and c->u_ctr.ctr are passed to the
+ * low-level routine as its offset and checksum arguments.
+ *
+ * The low-level routine is given one pointer per block in Ls[].  Full
+ * chunks of 32 blocks are processed first, then the remainder. */
+void
+_gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
+ : _gcry_aes_ocb_dec_armv8_ce;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.data_nblocks; /* blocks processed before this call */
+ u64 blkn_offs = blkn - blkn % 32; /* blkn rounded down to a multiple of 32 */
+ unsigned int n = 32 - blkn % 32; /* rotation of the Ls table so that Ls[0] belongs to the first block of this call */
+ unsigned char l_tmp[16]; /* scratch buffer handed to ocb_get_l */
+ void *Ls[32]; /* per-block pointers passed to the low-level routine */
+ void **l; /* the one Ls slot that changes between 32-block chunks */
+ size_t i;
+
+ c->u_mode.ocb.data_nblocks = blkn + nblocks;
+
+ if (nblocks >= 32)
+ {
+ for (i = 0; i < 32; i += 8) /* fixed pattern L[0],L[1],L[0],L[2],L[0],L[1],L[0] in each group of eight */
+ {
+ Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ }
+/* The eighth slot of the first three groups is fixed as well; the
+ * eighth slot of the last group (position 31) is refreshed through `l'
+ * for every chunk. */
+ Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ blkn_offs += 32; /* next multiple of 32, the block number belonging to slot `l' */
+ *l = (void *)ocb_get_l(c, l_tmp, blkn_offs);
+
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, 32,
+ nrounds);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+
+ if (nblocks && l < &Ls[nblocks]) /* the variable slot is among the remaining blocks: refresh it once more */
+ {
+ *l = (void *)ocb_get_l(c, l_tmp, 32 + blkn_offs);
+ }
+ }
+ else
+ {
+ for (i = 0; i < nblocks; i++) /* fewer than 32 blocks: look up every entry individually */
+ Ls[i] = (void *)ocb_get_l(c, l_tmp, ++blkn);
+ }
+
+ if (nblocks) /* remainder after the 32-block chunks, or the whole short input */
+ {
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls, nblocks,
+ nrounds);
+ }
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+}
+/* Bulk OCB processing of additional authenticated data.
+ *
+ * Feeds NBLOCKS blocks of 16 bytes from ABUF_ARG to the low-level auth
+ * routine, using the encryption key schedule of the context.
+ * c->u_mode.ocb.aad_nblocks is advanced by NBLOCKS.
+ * c->u_mode.ocb.aad_offset and c->u_mode.ocb.aad_sum are passed as the
+ * routine's offset and checksum arguments.  There is no output buffer.
+ *
+ * The low-level routine is given one pointer per block in Ls[].  Full
+ * chunks of 32 blocks are processed first, then the remainder. */
+void
+_gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = ctx->keyschenc32;
+ const unsigned char *abuf = abuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.aad_nblocks; /* AAD blocks processed before this call */
+ u64 blkn_offs = blkn - blkn % 32; /* blkn rounded down to a multiple of 32 */
+ unsigned int n = 32 - blkn % 32; /* rotation of the Ls table so that Ls[0] belongs to the first block of this call */
+ unsigned char l_tmp[16]; /* scratch buffer handed to ocb_get_l */
+ void *Ls[32]; /* per-block pointers passed to the low-level routine */
+ void **l; /* the one Ls slot that changes between 32-block chunks */
+ size_t i;
+
+ c->u_mode.ocb.aad_nblocks = blkn + nblocks;
+
+ if (nblocks >= 32)
+ {
+ for (i = 0; i < 32; i += 8) /* fixed pattern L[0],L[1],L[0],L[2],L[0],L[1],L[0] in each group of eight */
+ {
+ Ls[(i + 0 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (void *)c->u_mode.ocb.L[0];
+ }
+/* The eighth slot of the first three groups is fixed as well; the
+ * eighth slot of the last group (position 31) is refreshed through `l'
+ * for every chunk. */
+ Ls[(7 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ blkn_offs += 32; /* next multiple of 32, the block number belonging to slot `l' */
+ *l = (void *)ocb_get_l(c, l_tmp, blkn_offs);
+
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls, 32, nrounds);
+
+ nblocks -= 32;
+ abuf += 32 * 16;
+ }
+
+ if (nblocks && l < &Ls[nblocks]) /* the variable slot is among the remaining blocks: refresh it once more */
+ {
+ *l = (void *)ocb_get_l(c, l_tmp, 32 + blkn_offs);
+ }
+ }
+ else
+ {
+ for (i = 0; i < nblocks; i++) /* fewer than 32 blocks: look up every entry individually */
+ Ls[i] = (void *)ocb_get_l(c, l_tmp, ++blkn);
+ }
+
+ if (nblocks) /* remainder after the 32-block chunks, or the whole short input */
+ {
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls, nblocks, nrounds);
+ }
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+#endif /* USE_ARM_CE */