diff options
-rw-r--r-- | coreconf/config.gypi      |   1 |
-rw-r--r-- | lib/freebl/Makefile       |  18 |
-rw-r--r-- | lib/freebl/blinit.c       |   2 |
-rw-r--r-- | lib/freebl/freebl.gyp     |  13 |
-rw-r--r-- | lib/freebl/sha256-armv8.c | 203 |
-rw-r--r-- | lib/freebl/sha256.h       |   8 |
-rw-r--r-- | lib/freebl/sha512.c       |  66 |
7 files changed, 296 insertions(+), 15 deletions(-)
diff --git a/coreconf/config.gypi b/coreconf/config.gypi index 62d3cc71e..77ed8da98 100644 --- a/coreconf/config.gypi +++ b/coreconf/config.gypi @@ -97,6 +97,7 @@ 'cc_use_gnu_ld%': '<(cc_use_gnu_ld)', # Some defaults 'disable_arm_hw_aes%': 0, + 'disable_arm_hw_sha2%': 0, 'disable_tests%': 0, 'disable_chachapoly%': 0, 'disable_deprecated_seed%': 0, diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index 5f7384429..29dc940a3 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -120,25 +120,25 @@ else endif endif ifeq ($(CPU_ARCH),aarch64) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c gcm-aarch64.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c gcm-aarch64.c sha256-armv8.c endif ifeq ($(CPU_ARCH),arm) ifndef NSS_DISABLE_ARM32_NEON EXTRA_SRCS += gcm-arm32-neon.c endif ifdef CC_IS_CLANG - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c else ifeq (1,$(CC_IS_GCC)) # Old compiler doesn't support ARM AES. ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION)))) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c endif ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION)))) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c endif endif endif @@ -713,6 +713,7 @@ ifeq ($(CPU_ARCH),arm) # Confusingly, __SOFTFP__ is the name of the define for the softfloat ABI, not for the softfp ABI. 
USES_SOFTFLOAT_ABI := $(shell $(CC) -o - -E -dM - $(CFLAGS) < /dev/null | grep __SOFTFP__ > /dev/null && echo 1) $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) +$(OBJDIR)/$(PROG_PREFIX)sha256-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) ifndef NSS_DISABLE_ARM32_NEON $(OBJDIR)/$(PROG_PREFIX)gcm-arm32-neon$(OBJ_SUFFIX): CFLAGS += -mfpu=neon$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) endif @@ -720,6 +721,7 @@ endif ifeq ($(CPU_ARCH),aarch64) $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto $(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto +$(OBJDIR)/$(PROG_PREFIX)sha256-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto endif ifeq ($(CPU_ARCH),ppc) diff --git a/lib/freebl/blinit.c b/lib/freebl/blinit.c index db7d1eab1..4f2d2d492 100644 --- a/lib/freebl/blinit.c +++ b/lib/freebl/blinit.c @@ -232,6 +232,7 @@ CheckARMSupport() arm_neon_support_ = PR_GetEnvSecure("NSS_DISABLE_ARM_NEON") == NULL; arm_aes_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_AES") == NULL; arm_pmull_support_ &= PR_GetEnvSecure("NSS_DISABLE_PMULL") == NULL; + arm_sha2_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_SHA2") == NULL; } #endif /* defined(__aarch64__) */ @@ -355,6 +356,7 @@ CheckARMSupport() arm_sha2_support_ = hwcaps & HWCAP2_SHA2; } arm_neon_support_ = GetNeonSupport(); + arm_sha2_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_SHA2") == NULL; } #endif /* defined(__arm__) */ diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index f3bb8a71a..16fa61c6b 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -329,6 +329,7 @@ 'type': 'static_library', 'sources': [ 'aes-armv8.c', + 'sha256-armv8.c', ], 'dependencies': [ '<(DEPTH)/exports.gyp:nss_exports' @@ -384,7 +385,7 @@ 'dependencies': [ 'gcm-aes-x86_c_lib', ], - }, 'disable_arm_hw_aes==0 
and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { + }, '(disable_arm_hw_aes==0 or disable_arm_hw_sha2==0) and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { 'dependencies': [ 'armv8_c_lib' ], @@ -642,6 +643,11 @@ 'USE_HW_AES', ], }], + [ 'OS=="win" and (target_arch=="arm64" or target_arch=="aarch64") and disable_arm_hw_sha2==0', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], [ 'cc_use_gnu_ld==1 and OS=="win" and target_arch=="x64"', { # mingw x64 'defines': [ @@ -704,6 +710,11 @@ 'USE_HW_AES', ], }], + [ 'disable_arm_hw_sha2==0 and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], ], }], ], diff --git a/lib/freebl/sha256-armv8.c b/lib/freebl/sha256-armv8.c new file mode 100644 index 000000000..17fe126c4 --- /dev/null +++ b/lib/freebl/sha256-armv8.c @@ -0,0 +1,203 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef USE_HW_SHA2 + +#ifndef __ARM_FEATURE_CRYPTO +#error "Compiler option is invalid" +#endif + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif + +#include "prcpucfg.h" +#include "prtypes.h" /* for PRUintXX */ +#include "prlong.h" +#include "blapi.h" +#include "sha256.h" + +#include <arm_neon.h> + +/* SHA-256 constants, K256. 
*/ +static const PRUint32 __attribute__((aligned(16))) K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define ROUND(n, a, b, c, d) \ + { \ + uint32x4_t t = vaddq_u32(a, k##n); \ + uint32x4_t wt = w0; \ + w0 = vsha256hq_u32(w0, w1, t); \ + w1 = vsha256h2q_u32(w1, wt, t); \ + if (n < 12) { \ + a = vsha256su0q_u32(a, b); \ + a = vsha256su1q_u32(a, c, d); \ + } \ + } + +void +SHA256_Compress_Native(SHA256Context *ctx) +{ + const uint32x4_t k0 = vld1q_u32(K256); + const uint32x4_t k1 = vld1q_u32(K256 + 4); + const uint32x4_t k2 = vld1q_u32(K256 + 8); + const uint32x4_t k3 = vld1q_u32(K256 + 12); + const uint32x4_t k4 = vld1q_u32(K256 + 16); + const uint32x4_t k5 = vld1q_u32(K256 + 20); + const uint32x4_t k6 = vld1q_u32(K256 + 24); + const uint32x4_t k7 = vld1q_u32(K256 + 28); + const uint32x4_t k8 = vld1q_u32(K256 + 32); + const uint32x4_t k9 = vld1q_u32(K256 + 36); + const uint32x4_t k10 = vld1q_u32(K256 + 40); + const uint32x4_t k11 = vld1q_u32(K256 + 44); + const uint32x4_t k12 = vld1q_u32(K256 + 48); + const uint32x4_t k13 = vld1q_u32(K256 + 52); + const uint32x4_t k14 = vld1q_u32(K256 + 56); + const uint32x4_t k15 = vld1q_u32(K256 + 60); + + uint32x4_t h0 = vld1q_u32(ctx->h); + uint32x4_t h1 = 
vld1q_u32(ctx->h + 4); + + unsigned char *input = ctx->u.b; + + uint32x4_t a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input))); + uint32x4_t b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); + uint32x4_t c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); + uint32x4_t d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); + + uint32x4_t w0 = h0; + uint32x4_t w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = vaddq_u32(h0, w0); + h1 = vaddq_u32(h1, w1); + + vst1q_u32(ctx->h, h0); + vst1q_u32(ctx->h + 4, h1); +} + +void +SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ + const uint32x4_t k0 = vld1q_u32(K256); + const uint32x4_t k1 = vld1q_u32(K256 + 4); + const uint32x4_t k2 = vld1q_u32(K256 + 8); + const uint32x4_t k3 = vld1q_u32(K256 + 12); + const uint32x4_t k4 = vld1q_u32(K256 + 16); + const uint32x4_t k5 = vld1q_u32(K256 + 20); + const uint32x4_t k6 = vld1q_u32(K256 + 24); + const uint32x4_t k7 = vld1q_u32(K256 + 28); + const uint32x4_t k8 = vld1q_u32(K256 + 32); + const uint32x4_t k9 = vld1q_u32(K256 + 36); + const uint32x4_t k10 = vld1q_u32(K256 + 40); + const uint32x4_t k11 = vld1q_u32(K256 + 44); + const uint32x4_t k12 = vld1q_u32(K256 + 48); + const uint32x4_t k13 = vld1q_u32(K256 + 52); + const uint32x4_t k14 = vld1q_u32(K256 + 56); + const uint32x4_t k15 = vld1q_u32(K256 + 60); + + unsigned int inBuf = ctx->sizeLo & 0x3f; + if (!inputLen) { + return; + } + + /* Add inputLen into the count of bytes processed, before processing */ + if ((ctx->sizeLo += inputLen) < inputLen) { + ctx->sizeHi++; + } + + /* if data already in buffer, attemp to fill rest 
of buffer */ + if (inBuf) { + unsigned int todo = SHA256_BLOCK_LENGTH - inBuf; + if (inputLen < todo) { + todo = inputLen; + } + memcpy(ctx->u.b + inBuf, input, todo); + input += todo; + inputLen -= todo; + if (inBuf + todo == SHA256_BLOCK_LENGTH) { + SHA256_Compress_Native(ctx); + } + } + + uint32x4_t h0 = vld1q_u32(ctx->h); + uint32x4_t h1 = vld1q_u32(ctx->h + 4); + + /* if enough data to fill one or more whole buffers, process them. */ + while (inputLen >= SHA256_BLOCK_LENGTH) { + uint32x4_t a, b, c, d; + a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input))); + b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); + c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); + d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); + input += SHA256_BLOCK_LENGTH; + inputLen -= SHA256_BLOCK_LENGTH; + + uint32x4_t w0 = h0; + uint32x4_t w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = vaddq_u32(h0, w0); + h1 = vaddq_u32(h1, w1); + } + + vst1q_u32(ctx->h, h0); + vst1q_u32(ctx->h + 4, h1); + + /* if data left over, fill it into buffer */ + if (inputLen) { + memcpy(ctx->u.b, input, inputLen); + } +} + +#endif /* USE_HW_SHA2 */ diff --git a/lib/freebl/sha256.h b/lib/freebl/sha256.h index c65ca152d..645118b07 100644 --- a/lib/freebl/sha256.h +++ b/lib/freebl/sha256.h @@ -7,6 +7,12 @@ #include "prtypes.h" +struct SHA256ContextStr; + +typedef void (*sha256_compress_t)(struct SHA256ContextStr *); +typedef void (*sha256_update_t)(struct SHA256ContextStr *, const unsigned char *, + unsigned int); + struct SHA256ContextStr { union { PRUint32 w[64]; /* message schedule, input buffer, plus 48 words */ @@ -14,6 +20,8 @@ struct 
SHA256ContextStr { } u; PRUint32 h[8]; /* 8 state variables */ PRUint32 sizeHi, sizeLo; /* 64-bit count of hashed bytes. */ + sha256_compress_t compress; + sha256_update_t update; }; #endif /* _SHA_256_H_ */ diff --git a/lib/freebl/sha512.c b/lib/freebl/sha512.c index f2a1a33ca..dc0ed776b 100644 --- a/lib/freebl/sha512.c +++ b/lib/freebl/sha512.c @@ -19,6 +19,7 @@ #include "secport.h" /* for PORT_XXX */ #include "blapi.h" #include "blapii.h" +#include "secerr.h" #include "sha256.h" /* for struct SHA256ContextStr */ #include "crypto_primitives.h" #include "ppc-crypto.h" /* for USE_PPC_CRYPTO */ @@ -156,6 +157,30 @@ swap4b(PRUint32 value) #define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ SHR(x, 3)) #define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ SHR(x, 10)) +void SHA256_Compress_Native(SHA256Context *ctx); +void SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, unsigned int inputLen); + +static void SHA256_Compress_Generic(SHA256Context *ctx); +static void SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen); + +#ifndef USE_HW_SHA2 +void +SHA256_Compress_Native(SHA256Context *ctx) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} + +void +SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} +#endif + SHA256Context * SHA256_NewContext(void) { @@ -177,6 +202,17 @@ SHA256_Begin(SHA256Context *ctx) { memset(ctx, 0, sizeof *ctx); memcpy(H, H256, sizeof H256); +#if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) + /* arm's implementation is tested on little endian only */ + if (arm_sha2_support()) { + ctx->compress = SHA256_Compress_Native; + ctx->update = SHA256_Update_Native; + } else +#endif + { + ctx->compress = SHA256_Compress_Generic; + ctx->update = SHA256_Update_Generic; + } } #if defined(USE_PPC_CRYPTO) @@ -273,7 +309,7 @@ SHA256_Begin(SHA256Context *ctx) ROUND(63, b, c, d, e, f, g, 
h, a) static void -SHA256_Compress(SHA256Context *ctx) +SHA256_Compress_Generic(SHA256Context *ctx) { #if defined(USE_PPC_CRYPTO) vec_u32 w[16], s0, s1; @@ -475,6 +511,13 @@ void SHA256_Update(SHA256Context *ctx, const unsigned char *input, unsigned int inputLen) { + ctx->update(ctx, input, inputLen); +} + +static void +SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ unsigned int inBuf = ctx->sizeLo & 0x3f; if (!inputLen) return; @@ -492,7 +535,7 @@ SHA256_Update(SHA256Context *ctx, const unsigned char *input, input += todo; inputLen -= todo; if (inBuf + todo == SHA256_BLOCK_LENGTH) - SHA256_Compress(ctx); + SHA256_Compress_Generic(ctx); } /* if enough data to fill one or more whole buffers, process them. */ @@ -500,7 +543,7 @@ SHA256_Update(SHA256Context *ctx, const unsigned char *input, memcpy(B, input, SHA256_BLOCK_LENGTH); input += SHA256_BLOCK_LENGTH; inputLen -= SHA256_BLOCK_LENGTH; - SHA256_Compress(ctx); + SHA256_Compress_Generic(ctx); } /* if data left over, fill it into buffer */ if (inputLen) @@ -518,7 +561,7 @@ SHA256_End(SHA256Context *ctx, unsigned char *digest, hi = (ctx->sizeHi << 3) | (ctx->sizeLo >> 29); lo = (ctx->sizeLo << 3); - SHA256_Update(ctx, pad, padLen); + ctx->update(ctx, pad, padLen); #if defined(IS_LITTLE_ENDIAN) W[14] = SHA_HTONL(hi); @@ -527,7 +570,7 @@ SHA256_End(SHA256Context *ctx, unsigned char *digest, W[14] = hi; W[15] = lo; #endif - SHA256_Compress(ctx); + ctx->compress(ctx); /* now output the answer */ #if defined(IS_LITTLE_ENDIAN) @@ -651,13 +694,24 @@ SHA224_Begin(SHA224Context *ctx) { memset(ctx, 0, sizeof *ctx); memcpy(H, H224, sizeof H224); +#if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) + /* arm's implementation is tested on little endian only */ + if (arm_sha2_support()) { + ctx->compress = SHA256_Compress_Native; + ctx->update = SHA256_Update_Native; + } else +#endif + { + ctx->compress = SHA256_Compress_Generic; + ctx->update = SHA256_Update_Generic; + } } void 
SHA224_Update(SHA224Context *ctx, const unsigned char *input, unsigned int inputLen) { - SHA256_Update(ctx, input, inputLen); + ctx->update(ctx, input, inputLen); } void |