diff options
author | Makoto Kato <m_kato@ga2.so-net.ne.jp> | 2019-10-11 19:33:06 +0000 |
---|---|---|
committer | Makoto Kato <m_kato@ga2.so-net.ne.jp> | 2019-10-11 19:33:06 +0000 |
commit | 6fde68ff510a779fd12b02f08ba3ae081cf8b6d5 (patch) | |
tree | 36c72881d54b2c4e801308c096347f06d644425a | |
parent | d2a64b472badb3dc3817305705f0031ba21ce13f (diff) | |
download | nss-hg-6fde68ff510a779fd12b02f08ba3ae081cf8b6d5.tar.gz |
Bug 1152625 - Support AES HW acceleration on ARMv8. r=kjacobs,jcj
Differential Revision: https://phabricator.services.mozilla.com/D34473
-rw-r--r-- | lib/freebl/Makefile | 23 | ||||
-rw-r--r-- | lib/freebl/aes-armv8.c | 1168 | ||||
-rw-r--r-- | lib/freebl/aes-armv8.h | 103 | ||||
-rw-r--r-- | lib/freebl/freebl.gyp | 43 | ||||
-rw-r--r-- | lib/freebl/intel-aes.h | 6 | ||||
-rw-r--r-- | lib/freebl/rijndael.c | 21 |
6 files changed, 1356 insertions, 8 deletions
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index 81ea8b734..5943fb377 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -120,7 +120,24 @@ else endif endif ifeq ($(CPU_ARCH),aarch64) - EXTRA_SRCS += gcm-aarch64.c + DEFINES += -DUSE_HW_AES + EXTRA_SRCS += aes-armv8.c gcm-aarch64.c +endif +ifeq ($(CPU_ARCH),arm) + ifdef CC_IS_CLANG + DEFINES += -DUSE_HW_AES + EXTRA_SRCS += aes-armv8.c + else ifeq (1,$(CC_IS_GCC)) + # Old compiler doesn't support ARM AES. + ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION)))) + DEFINES += -DUSE_HW_AES + EXTRA_SRCS += aes-armv8.c + endif + ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION)))) + DEFINES += -DUSE_HW_AES + EXTRA_SRCS += aes-armv8.c + endif + endif endif ifeq ($(OS_TARGET),OSF1) @@ -761,6 +778,10 @@ ifdef INTEL_GCM_CLANG_CL $(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): CFLAGS += -mssse3 endif +ifeq ($(CPU_ARCH),arm) +$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8 +endif ifeq ($(CPU_ARCH),aarch64) +$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto $(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto endif diff --git a/lib/freebl/aes-armv8.c b/lib/freebl/aes-armv8.c new file mode 100644 index 000000000..1cc7e0d7c --- /dev/null +++ b/lib/freebl/aes-armv8.c @@ -0,0 +1,1168 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "secerr.h" +#include "rijndael.h" + +#if (defined(__clang__) || \ + (defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 8)))) + +#ifndef __ARM_FEATURE_CRYPTO +#error "Compiler option is invalid" +#endif + +#include <arm_neon.h> + +SECStatus +arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + /* AddRoundKey */ + state = veorq_u8(state, key11); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (inputLen == 0) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + uint8x16_t iv; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16)); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + /* AddRoundKey */ + state = veorq_u8(state, key11); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16)); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + + while (inputLen > 0) { + uint8x16_t state, old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + PRUint8 *key = (PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + /* AddRoundKey */ + state = veorq_u8(state, key13); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + uint8x16_t iv; + PRUint8 *key = (PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = veorq_u8(state, key13); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16)); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + + while (inputLen > 0) { + uint8x16_t state, old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + PRUint8 *key = (PRUint8 *)cx->expandedKey; + + if (inputLen == 0) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16)); + key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key13); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key14); + /* AddRoundKey */ + state = veorq_u8(state, key15); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16)); + key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key15); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key14); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + } + + return SECSuccess; +} + +SECStatus +arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + uint8x16_t iv; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16)); + key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16)); + + while (inputLen > 0) { + uint8x16_t state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + input += 16; + inputLen -= 16; + + state = veorq_u8(state, iv); + + /* Rounds */ + state = vaeseq_u8(state, key1); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key2); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key3); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key4); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key5); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key6); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key7); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key8); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key9); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key10); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key11); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key12); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key13); + state = vaesmcq_u8(state); + state = vaeseq_u8(state, key14); + /* AddRoundKey */ + state = veorq_u8(state, key15); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + iv = state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +SECStatus +arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize) +{ +#if !defined(HAVE_UNALIGNED_ACCESS) + pre_align unsigned char buf[16] post_align; +#endif + uint8x16_t iv; + uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10; + uint8x16_t key11, key12, key13, key14, key15; + const PRUint8 *key = (const PRUint8 *)cx->expandedKey; + + if (!inputLen) { + return SECSuccess; + } + + /* iv */ + iv = vld1q_u8(cx->iv); + + /* expanedKey */ + key1 = vld1q_u8(__builtin_assume_aligned(key, 16)); + key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16)); + key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16)); + key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16)); + key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16)); + key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16)); + key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16)); + key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16)); + key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16)); + key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16)); + key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16)); + key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16)); + key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16)); + key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16)); + key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16)); + + while (inputLen > 0) { + uint8x16_t state, old_state; +#if defined(HAVE_UNALIGNED_ACCESS) + state = vld1q_u8(input); +#else + if ((uintptr_t)input & 0x7) { + memcpy(buf, input, 16); + state = vld1q_u8(__builtin_assume_aligned(buf, 16)); + } else { + state = vld1q_u8(__builtin_assume_aligned(input, 8)); + } +#endif + old_state = state; + input += 16; + inputLen -= 16; + + /* Rounds */ + state = vaesdq_u8(state, key15); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key14); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key13); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key12); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key11); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key10); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key9); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key8); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key7); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key6); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key5); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key4); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key3); + state = vaesimcq_u8(state); + state = vaesdq_u8(state, key2); + /* AddRoundKey */ + state = veorq_u8(state, key1); + + state = veorq_u8(state, iv); + +#if defined(HAVE_UNALIGNED_ACCESS) + vst1q_u8(output, state); +#else + if ((uintptr_t)output & 0x7) { + vst1q_u8(__builtin_assume_aligned(buf, 16), state); + memcpy(output, buf, 16); + } else { + vst1q_u8(__builtin_assume_aligned(output, 8), state); + } +#endif + output += 16; + + iv = old_state; + } + vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv); + + return SECSuccess; +} + +#endif diff --git a/lib/freebl/aes-armv8.h b/lib/freebl/aes-armv8.h new file mode 100644 index 000000000..b0ef1c870 --- /dev/null +++ b/lib/freebl/aes-armv8.h @@ -0,0 +1,103 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +SECStatus arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); +SECStatus arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output, + unsigned int *outputLen, + unsigned int maxOutputLen, + const unsigned char *input, + unsigned int inputLen, + unsigned int blocksize); + +#define native_aes_ecb_worker(encrypt, keysize) \ + ((encrypt) \ + ? ((keysize) == 16 ? arm_aes_encrypt_ecb_128 \ + : (keysize) == 24 ? arm_aes_encrypt_ecb_192 \ + : arm_aes_encrypt_ecb_256) \ + : ((keysize) == 16 ? arm_aes_decrypt_ecb_128 \ + : (keysize) == 24 ? arm_aes_decrypt_ecb_192 \ + : arm_aes_decrypt_ecb_256)) + +#define native_aes_cbc_worker(encrypt, keysize) \ + ((encrypt) \ + ? ((keysize) == 16 ? arm_aes_encrypt_cbc_128 \ + : (keysize) == 24 ? arm_aes_encrypt_cbc_192 \ + : arm_aes_encrypt_cbc_256) \ + : ((keysize) == 16 ? arm_aes_decrypt_cbc_128 \ + : (keysize) == 24 ? arm_aes_decrypt_cbc_192 \ + : arm_aes_decrypt_cbc_256)) + +#define native_aes_init(encrypt, keysize) \ + do { \ + if (encrypt) { \ + rijndael_key_expansion(cx, key, Nk); \ + } else { \ + rijndael_invkey_expansion(cx, key, Nk); \ + } \ + } while (0) diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index e96b0fb7b..dc33f73a7 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -133,6 +133,35 @@ ] }, { + 'target_name': 'armv8_c_lib', + 'type': 'static_library', + 'sources': [ + 'aes-armv8.c', + ], + 'dependencies': [ + '<(DEPTH)/exports.gyp:nss_exports' + ], + 'conditions': [ + [ 'target_arch=="arm"', { + 'cflags': [ + '-march=armv8-a', + '-mfpu=crypto-neon-fp-armv8' + ], + 'cflags_mozilla': [ + '-march=armv8-a', + '-mfpu=crypto-neon-fp-armv8' + ], + }, 'target_arch=="arm64" or target_arch=="aarch64"', { + 'cflags': [ + '-march=armv8-a+crypto' + ], + 'cflags_mozilla': [ + '-march=armv8-a+crypto' + ], + }] + ] + }, + { 'target_name': 'freebl', 'type': 'static_library', 'sources': [ @@ -160,6 +189,10 @@ 'dependencies': [ 'gcm-aes-x86_c_lib', ], + }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', { + 'dependencies': [ + 'armv8_c_lib' + ], }], [ 'target_arch=="arm64" or target_arch=="aarch64"', { 'dependencies': [ @@ -202,6 +235,10 @@ 'dependencies': [ 'gcm-aes-x86_c_lib', ] + }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', { + 'dependencies': [ + 'armv8_c_lib', + ], }], [ 'target_arch=="arm64" or target_arch=="aarch64"', { 'dependencies': [ @@ -429,6 +466,12 @@ 'MP_USE_UINT_DIGIT', 'SHA_NO_LONG_LONG', 'ARMHF', + 'USE_HW_AES', + ], + }], + [ 'target_arch=="arm64" or target_arch=="aarch64"', { + 'defines': [ + 'USE_HW_AES', ], }], ], diff --git a/lib/freebl/intel-aes.h b/lib/freebl/intel-aes.h index d5bd2d8ca..970f5192c 100644 --- a/lib/freebl/intel-aes.h +++ b/lib/freebl/intel-aes.h @@ -100,7 +100,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output, unsigned int inputLen, unsigned int blocksize); -#define intel_aes_ecb_worker(encrypt, keysize) \ +#define native_aes_ecb_worker(encrypt, keysize) \ ((encrypt) \ ? ((keysize) == 16 ? intel_aes_encrypt_ecb_128 \ : (keysize) == 24 ? intel_aes_encrypt_ecb_192 \ @@ -109,7 +109,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output, : (keysize) == 24 ? intel_aes_decrypt_ecb_192 \ : intel_aes_decrypt_ecb_256)) -#define intel_aes_cbc_worker(encrypt, keysize) \ +#define native_aes_cbc_worker(encrypt, keysize) \ ((encrypt) \ ? ((keysize) == 16 ? intel_aes_encrypt_cbc_128 \ : (keysize) == 24 ? intel_aes_encrypt_cbc_192 \ @@ -123,7 +123,7 @@ SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output, : (nr) == 12 ? intel_aes_encrypt_ctr_192 \ : intel_aes_encrypt_ctr_256) -#define intel_aes_init(encrypt, keysize) \ +#define native_aes_init(encrypt, keysize) \ do { \ if (encrypt) { \ if (keysize == 16) \ diff --git a/lib/freebl/rijndael.c b/lib/freebl/rijndael.c index 6dab440f8..26bd58ee0 100644 --- a/lib/freebl/rijndael.c +++ b/lib/freebl/rijndael.c @@ -20,9 +20,18 @@ #include "gcm.h" #include "mpi.h" +#if !defined(IS_LITTLE_ENDIAN) && !defined(NSS_X86_OR_X64) +// not test yet on big endian platform of arm +#undef USE_HW_AES +#endif + #ifdef USE_HW_AES +#ifdef NSS_X86_OR_X64 #include "intel-aes.h" +#else +#include "aes-armv8.h" #endif +#endif /* USE_HW_AES */ #ifdef INTEL_GCM #include "intel-gcm.h" #endif /* INTEL_GCM */ @@ -847,7 +856,11 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, PORT_SetError(SEC_ERROR_INVALID_ARGS); return SECFailure; } - use_hw_aes = aesni_support() && (keysize % 8) == 0; +#if defined(NSS_X86_OR_X64) || defined(USE_HW_AES) + use_hw_aes = (aesni_support() || arm_aes_support()) && (keysize % 8) == 0; +#else + use_hw_aes = PR_FALSE; +#endif /* Nb = (block size in bits) / 32 */ cx->Nb = AES_BLOCK_SIZE / 4; /* Nk = (key size in bits) / 32 */ @@ -860,7 +873,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, #ifdef USE_HW_AES if (use_hw_aes) { cx->worker = (freeblCipherFunc) - intel_aes_cbc_worker(encrypt, keysize); + native_aes_cbc_worker(encrypt, keysize); } else #endif { @@ -872,7 +885,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, #ifdef USE_HW_AES if (use_hw_aes) { cx->worker = (freeblCipherFunc) - intel_aes_ecb_worker(encrypt, keysize); + native_aes_ecb_worker(encrypt, keysize); } else #endif { @@ -888,7 +901,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, } #ifdef USE_HW_AES if (use_hw_aes) { - intel_aes_init(encrypt, keysize); + native_aes_init(encrypt, keysize); } else #endif { |