author    Makoto Kato <m_kato@ga2.so-net.ne.jp>  2019-10-11 19:33:06 +0000
committer Makoto Kato <m_kato@ga2.so-net.ne.jp>  2019-10-11 19:33:06 +0000
commit    6fde68ff510a779fd12b02f08ba3ae081cf8b6d5 (patch)
tree      36c72881d54b2c4e801308c096347f06d644425a
parent    d2a64b472badb3dc3817305705f0031ba21ce13f (diff)
Bug 1152625 - Support AES HW acceleration on ARMv8. r=kjacobs,jcj
Differential Revision: https://phabricator.services.mozilla.com/D34473
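
The new lib/freebl/aes-armv8.c below is built entirely from the ARMv8 Crypto Extension intrinsics in <arm_neon.h>: vaeseq_u8 performs AddRoundKey + SubBytes + ShiftRows in a single instruction, vaesmcq_u8 performs MixColumns, and the last round skips MixColumns and XORs in the final round key with veorq_u8 (vaesdq_u8 and vaesimcq_u8 are the inverse pair used for decryption). A minimal sketch of that round structure for a single AES-128 block, assuming the eleven round keys have already been expanded (the patch relies on rijndael_key_expansion() for that):

#include <stdint.h>
#include <arm_neon.h>

/* Encrypt one 16-byte block with pre-expanded AES-128 round keys rk[0..10].
 * This mirrors the loop body added in aes-armv8.c; build with
 * -march=armv8-a+crypto (AArch64) or -mfpu=crypto-neon-fp-armv8 (32-bit ARM). */
static void
aes128_encrypt_block(uint8_t out[16], const uint8_t in[16],
                     const uint8x16_t rk[11])
{
    uint8x16_t st = vld1q_u8(in);
    for (int i = 0; i < 9; i++) {
        st = vaeseq_u8(st, rk[i]); /* AddRoundKey + SubBytes + ShiftRows */
        st = vaesmcq_u8(st);       /* MixColumns */
    }
    st = vaeseq_u8(st, rk[9]);     /* final round has no MixColumns */
    st = veorq_u8(st, rk[10]);     /* final AddRoundKey */
    vst1q_u8(out, st);
}

The unrolled sequences in the patch are the 10-, 12-, and 14-round versions of this loop for 128-, 192-, and 256-bit keys.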
-rw-r--r--  lib/freebl/Makefile     |   23
-rw-r--r--  lib/freebl/aes-armv8.c  | 1168
-rw-r--r--  lib/freebl/aes-armv8.h  |  103
-rw-r--r--  lib/freebl/freebl.gyp   |   43
-rw-r--r--  lib/freebl/intel-aes.h  |    6
-rw-r--r--  lib/freebl/rijndael.c   |   21
6 files changed, 1356 insertions(+), 8 deletions(-)
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index 81ea8b734..5943fb377 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -120,7 +120,24 @@ else
endif
endif
ifeq ($(CPU_ARCH),aarch64)
- EXTRA_SRCS += gcm-aarch64.c
+ DEFINES += -DUSE_HW_AES
+ EXTRA_SRCS += aes-armv8.c gcm-aarch64.c
+endif
+ifeq ($(CPU_ARCH),arm)
+ ifdef CC_IS_CLANG
+ DEFINES += -DUSE_HW_AES
+ EXTRA_SRCS += aes-armv8.c
+ else ifeq (1,$(CC_IS_GCC))
+ # Older compilers do not support the ARMv8 AES intrinsics.
+ ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION))))
+ DEFINES += -DUSE_HW_AES
+ EXTRA_SRCS += aes-armv8.c
+ endif
+ ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION))))
+ DEFINES += -DUSE_HW_AES
+ EXTRA_SRCS += aes-armv8.c
+ endif
+ endif
endif
ifeq ($(OS_TARGET),OSF1)
@@ -761,6 +778,10 @@ ifdef INTEL_GCM_CLANG_CL
$(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): CFLAGS += -mssse3
endif
+ifeq ($(CPU_ARCH),arm)
+$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8
+endif
ifeq ($(CPU_ARCH),aarch64)
+$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
$(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
endif
diff --git a/lib/freebl/aes-armv8.c b/lib/freebl/aes-armv8.c
new file mode 100644
index 000000000..1cc7e0d7c
--- /dev/null
+++ b/lib/freebl/aes-armv8.c
@@ -0,0 +1,1168 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "secerr.h"
+#include "rijndael.h"
+
+#if (defined(__clang__) || \
+ (defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 8))))
+
+#ifndef __ARM_FEATURE_CRYPTO
+#error "Compiler option is invalid"
+#endif
+
+#include <arm_neon.h>
+
+SECStatus
+arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ /* AddRoundKey */
+ state = veorq_u8(state, key11);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (inputLen == 0) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11;
+ uint8x16_t iv;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ state = veorq_u8(state, iv);
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ /* AddRoundKey */
+ state = veorq_u8(state, key11);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ iv = state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t iv;
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ old_state = state;
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+ state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+
+ iv = old_state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13;
+ PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key11);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key12);
+ /* AddRoundKey */
+ state = veorq_u8(state, key13);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key13);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key12);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13;
+ uint8x16_t iv;
+ PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(cx->iv);
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ state = veorq_u8(state, iv);
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key11);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key12);
+ state = veorq_u8(state, key13);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ iv = state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t iv;
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ old_state = state;
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key13);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key12);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+ state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+
+ iv = old_state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13, key14, key15;
+ PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+ if (inputLen == 0) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+ key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+ key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key11);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key12);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key13);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key14);
+ /* AddRoundKey */
+ state = veorq_u8(state, key15);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13, key14, key15;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+ key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+ key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key15);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key14);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key13);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key12);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ }
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13, key14, key15;
+ uint8x16_t iv;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(cx->iv);
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+ key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+ key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ input += 16;
+ inputLen -= 16;
+
+ state = veorq_u8(state, iv);
+
+ /* Rounds */
+ state = vaeseq_u8(state, key1);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key2);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key3);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key4);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key5);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key6);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key7);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key8);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key9);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key10);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key11);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key12);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key13);
+ state = vaesmcq_u8(state);
+ state = vaeseq_u8(state, key14);
+ /* AddRoundKey */
+ state = veorq_u8(state, key15);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+ iv = state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+ pre_align unsigned char buf[16] post_align;
+#endif
+ uint8x16_t iv;
+ uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+ uint8x16_t key11, key12, key13, key14, key15;
+ const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+ if (!inputLen) {
+ return SECSuccess;
+ }
+
+ /* iv */
+ iv = vld1q_u8(cx->iv);
+
+ /* expandedKey */
+ key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+ key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+ key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+ key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+ key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+ key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+ key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+ key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+ key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+ key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+ key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+ key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+ key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+ key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+ key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+ while (inputLen > 0) {
+ uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+ state = vld1q_u8(input);
+#else
+ if ((uintptr_t)input & 0x7) {
+ memcpy(buf, input, 16);
+ state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+ } else {
+ state = vld1q_u8(__builtin_assume_aligned(input, 8));
+ }
+#endif
+ old_state = state;
+ input += 16;
+ inputLen -= 16;
+
+ /* Rounds */
+ state = vaesdq_u8(state, key15);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key14);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key13);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key12);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key11);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key10);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key9);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key8);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key7);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key6);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key5);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key4);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key3);
+ state = vaesimcq_u8(state);
+ state = vaesdq_u8(state, key2);
+ /* AddRoundKey */
+ state = veorq_u8(state, key1);
+
+ state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+ vst1q_u8(output, state);
+#else
+ if ((uintptr_t)output & 0x7) {
+ vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+ memcpy(output, buf, 16);
+ } else {
+ vst1q_u8(__builtin_assume_aligned(output, 8), state);
+ }
+#endif
+ output += 16;
+
+ iv = old_state;
+ }
+ vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+ return SECSuccess;
+}
+
+#endif
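
Note how the CBC variants above thread the chaining value: encryption XORs the IV into the plaintext before the rounds and then reuses the ciphertext block as the next IV, while decryption must save the raw ciphertext (old_state) before running the inverse rounds over it. Stripped of the NEON details, the dataflow is just the following sketch (plain C, not NSS code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*aes_block_fn)(uint8_t out[16], const uint8_t in[16],
                             const void *ks);

/* CBC encryption: C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV. */
static void
cbc_encrypt(aes_block_fn enc, const void *ks, uint8_t iv[16],
            uint8_t *out, const uint8_t *in, size_t len)
{
    for (; len >= 16; len -= 16, in += 16, out += 16) {
        uint8_t x[16];
        for (int i = 0; i < 16; i++)
            x[i] = in[i] ^ iv[i];
        enc(out, x, ks);
        memcpy(iv, out, 16); /* the ciphertext becomes the next IV */
    }
}

/* CBC decryption: P[i] = D(C[i]) ^ C[i-1]; C[i] must be saved first
 * (the old_state variable in aes-armv8.c). */
static void
cbc_decrypt(aes_block_fn dec, const void *ks, uint8_t iv[16],
            uint8_t *out, const uint8_t *in, size_t len)
{
    for (; len >= 16; len -= 16, in += 16, out += 16) {
        uint8_t saved[16];
        memcpy(saved, in, 16);
        dec(out, in, ks);
        for (int i = 0; i < 16; i++)
            out[i] ^= iv[i];
        memcpy(iv, saved, 16);
    }
}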
diff --git a/lib/freebl/aes-armv8.h b/lib/freebl/aes-armv8.h
new file mode 100644
index 000000000..b0ef1c870
--- /dev/null
+++ b/lib/freebl/aes-armv8.h
@@ -0,0 +1,103 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+SECStatus arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
+ unsigned int *outputLen,
+ unsigned int maxOutputLen,
+ const unsigned char *input,
+ unsigned int inputLen,
+ unsigned int blocksize);
+
+#define native_aes_ecb_worker(encrypt, keysize) \
+ ((encrypt) \
+ ? ((keysize) == 16 ? arm_aes_encrypt_ecb_128 \
+ : (keysize) == 24 ? arm_aes_encrypt_ecb_192 \
+ : arm_aes_encrypt_ecb_256) \
+ : ((keysize) == 16 ? arm_aes_decrypt_ecb_128 \
+ : (keysize) == 24 ? arm_aes_decrypt_ecb_192 \
+ : arm_aes_decrypt_ecb_256))
+
+#define native_aes_cbc_worker(encrypt, keysize) \
+ ((encrypt) \
+ ? ((keysize) == 16 ? arm_aes_encrypt_cbc_128 \
+ : (keysize) == 24 ? arm_aes_encrypt_cbc_192 \
+ : arm_aes_encrypt_cbc_256) \
+ : ((keysize) == 16 ? arm_aes_decrypt_cbc_128 \
+ : (keysize) == 24 ? arm_aes_decrypt_cbc_192 \
+ : arm_aes_decrypt_cbc_256))
+
+#define native_aes_init(encrypt, keysize) \
+ do { \
+ if (encrypt) { \
+ rijndael_key_expansion(cx, key, Nk); \
+ } else { \
+ rijndael_invkey_expansion(cx, key, Nk); \
+ } \
+ } while (0)
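
The native_aes_*_worker macros deliberately carry the same names that intel-aes.h adopts below, so rijndael.c can dispatch to the hardware implementation without knowing the architecture; as the rijndael.c hunk at the end of this patch shows, the call site is simply:

/* From aes_InitContext() (see the rijndael.c diff below): */
cx->worker = (freeblCipherFunc)native_aes_cbc_worker(encrypt, keysize);
/* or, in ECB mode: */
cx->worker = (freeblCipherFunc)native_aes_ecb_worker(encrypt, keysize);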
diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp
index e96b0fb7b..dc33f73a7 100644
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -133,6 +133,35 @@
]
},
{
+ 'target_name': 'armv8_c_lib',
+ 'type': 'static_library',
+ 'sources': [
+ 'aes-armv8.c',
+ ],
+ 'dependencies': [
+ '<(DEPTH)/exports.gyp:nss_exports'
+ ],
+ 'conditions': [
+ [ 'target_arch=="arm"', {
+ 'cflags': [
+ '-march=armv8-a',
+ '-mfpu=crypto-neon-fp-armv8'
+ ],
+ 'cflags_mozilla': [
+ '-march=armv8-a',
+ '-mfpu=crypto-neon-fp-armv8'
+ ],
+ }, 'target_arch=="arm64" or target_arch=="aarch64"', {
+ 'cflags': [
+ '-march=armv8-a+crypto'
+ ],
+ 'cflags_mozilla': [
+ '-march=armv8-a+crypto'
+ ],
+ }]
+ ]
+ },
+ {
'target_name': 'freebl',
'type': 'static_library',
'sources': [
@@ -160,6 +189,10 @@
'dependencies': [
'gcm-aes-x86_c_lib',
],
+ }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', {
+ 'dependencies': [
+ 'armv8_c_lib'
+ ],
}],
[ 'target_arch=="arm64" or target_arch=="aarch64"', {
'dependencies': [
@@ -202,6 +235,10 @@
'dependencies': [
'gcm-aes-x86_c_lib',
]
+ }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', {
+ 'dependencies': [
+ 'armv8_c_lib',
+ ],
}],
[ 'target_arch=="arm64" or target_arch=="aarch64"', {
'dependencies': [
@@ -429,6 +466,12 @@
'MP_USE_UINT_DIGIT',
'SHA_NO_LONG_LONG',
'ARMHF',
+ 'USE_HW_AES',
+ ],
+ }],
+ [ 'target_arch=="arm64" or target_arch=="aarch64"', {
+ 'defines': [
+ 'USE_HW_AES',
],
}],
],
diff --git a/lib/freebl/intel-aes.h b/lib/freebl/intel-aes.h
index d5bd2d8ca..970f5192c 100644
--- a/lib/freebl/intel-aes.h
+++ b/lib/freebl/intel-aes.h
@@ -100,7 +100,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output,
unsigned int inputLen,
unsigned int blocksize);
-#define intel_aes_ecb_worker(encrypt, keysize) \
+#define native_aes_ecb_worker(encrypt, keysize) \
((encrypt) \
? ((keysize) == 16 ? intel_aes_encrypt_ecb_128 \
: (keysize) == 24 ? intel_aes_encrypt_ecb_192 \
@@ -109,7 +109,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output,
: (keysize) == 24 ? intel_aes_decrypt_ecb_192 \
: intel_aes_decrypt_ecb_256))
-#define intel_aes_cbc_worker(encrypt, keysize) \
+#define native_aes_cbc_worker(encrypt, keysize) \
((encrypt) \
? ((keysize) == 16 ? intel_aes_encrypt_cbc_128 \
: (keysize) == 24 ? intel_aes_encrypt_cbc_192 \
@@ -123,7 +123,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output,
: (nr) == 12 ? intel_aes_encrypt_ctr_192 \
: intel_aes_encrypt_ctr_256)
-#define intel_aes_init(encrypt, keysize) \
+#define native_aes_init(encrypt, keysize) \
do { \
if (encrypt) { \
if (keysize == 16) \
diff --git a/lib/freebl/rijndael.c b/lib/freebl/rijndael.c
index 6dab440f8..26bd58ee0 100644
--- a/lib/freebl/rijndael.c
+++ b/lib/freebl/rijndael.c
@@ -20,9 +20,18 @@
#include "gcm.h"
#include "mpi.h"
+#if !defined(IS_LITTLE_ENDIAN) && !defined(NSS_X86_OR_X64)
+// Not yet tested on big-endian ARM platforms.
+#undef USE_HW_AES
+#endif
+
#ifdef USE_HW_AES
+#ifdef NSS_X86_OR_X64
#include "intel-aes.h"
+#else
+#include "aes-armv8.h"
#endif
+#endif /* USE_HW_AES */
#ifdef INTEL_GCM
#include "intel-gcm.h"
#endif /* INTEL_GCM */
@@ -847,7 +856,11 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
PORT_SetError(SEC_ERROR_INVALID_ARGS);
return SECFailure;
}
- use_hw_aes = aesni_support() && (keysize % 8) == 0;
+#if defined(NSS_X86_OR_X64) || defined(USE_HW_AES)
+ use_hw_aes = (aesni_support() || arm_aes_support()) && (keysize % 8) == 0;
+#else
+ use_hw_aes = PR_FALSE;
+#endif
/* Nb = (block size in bits) / 32 */
cx->Nb = AES_BLOCK_SIZE / 4;
/* Nk = (key size in bits) / 32 */
@@ -860,7 +873,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
#ifdef USE_HW_AES
if (use_hw_aes) {
cx->worker = (freeblCipherFunc)
- intel_aes_cbc_worker(encrypt, keysize);
+ native_aes_cbc_worker(encrypt, keysize);
} else
#endif
{
@@ -872,7 +885,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
#ifdef USE_HW_AES
if (use_hw_aes) {
cx->worker = (freeblCipherFunc)
- intel_aes_ecb_worker(encrypt, keysize);
+ native_aes_ecb_worker(encrypt, keysize);
} else
#endif
{
@@ -888,7 +901,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
}
#ifdef USE_HW_AES
if (use_hw_aes) {
- intel_aes_init(encrypt, keysize);
+ native_aes_init(encrypt, keysize);
} else
#endif
{
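
The use_hw_aes decision above also depends on arm_aes_support(), a runtime CPU-feature probe this patch assumes freebl provides elsewhere. On Linux such a probe typically reads the ELF auxiliary vector; a hypothetical sketch (the helper name have_hw_aes and the Linux-only getauxval approach are assumptions, not NSS's actual implementation):

#include <sys/auxv.h>  /* getauxval, AT_HWCAP, AT_HWCAP2 */
#include <asm/hwcap.h> /* HWCAP_AES (aarch64) / HWCAP2_AES (32-bit arm) */

static int
have_hw_aes(void)
{
#if defined(__aarch64__)
    return (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
#elif defined(__arm__)
    return (getauxval(AT_HWCAP2) & HWCAP2_AES) != 0;
#else
    return 0;
#endif
}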