diff options
-rw-r--r-- | coreconf/config.gypi      |   1 |
-rw-r--r-- | lib/freebl/Makefile       |  18 |
-rw-r--r-- | lib/freebl/blinit.c       |   2 |
-rw-r--r-- | lib/freebl/freebl.gyp     |  13 |
-rw-r--r-- | lib/freebl/sha256-armv8.c | 203 |
-rw-r--r-- | lib/freebl/sha256.h       |   8 |
-rw-r--r-- | lib/freebl/sha512.c       |  66 |
7 files changed, 296 insertions(+), 15 deletions(-)
diff --git a/coreconf/config.gypi b/coreconf/config.gypi index 62d3cc71e..77ed8da98 100644 --- a/coreconf/config.gypi +++ b/coreconf/config.gypi @@ -97,6 +97,7 @@ 'cc_use_gnu_ld%': '<(cc_use_gnu_ld)', # Some defaults 'disable_arm_hw_aes%': 0, + 'disable_arm_hw_sha2%': 0, 'disable_tests%': 0, 'disable_chachapoly%': 0, 'disable_deprecated_seed%': 0, diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index 5f7384429..29dc940a3 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -120,25 +120,25 @@ else endif endif ifeq ($(CPU_ARCH),aarch64) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c gcm-aarch64.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c gcm-aarch64.c sha256-armv8.c endif ifeq ($(CPU_ARCH),arm) ifndef NSS_DISABLE_ARM32_NEON EXTRA_SRCS += gcm-arm32-neon.c endif ifdef CC_IS_CLANG - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c else ifeq (1,$(CC_IS_GCC)) # Old compiler doesn't support ARM AES. ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION)))) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c endif ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION)))) - DEFINES += -DUSE_HW_AES - EXTRA_SRCS += aes-armv8.c + DEFINES += -DUSE_HW_AES -DUSE_HW_SHA2 + EXTRA_SRCS += aes-armv8.c sha256-armv8.c endif endif endif @@ -713,6 +713,7 @@ ifeq ($(CPU_ARCH),arm) # Confusingly, __SOFTFP__ is the name of the define for the softfloat ABI, not for the softfp ABI. 
USES_SOFTFLOAT_ABI := $(shell $(CC) -o - -E -dM - $(CFLAGS) < /dev/null | grep __SOFTFP__ > /dev/null && echo 1) $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) +$(OBJDIR)/$(PROG_PREFIX)sha256-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) ifndef NSS_DISABLE_ARM32_NEON $(OBJDIR)/$(PROG_PREFIX)gcm-arm32-neon$(OBJ_SUFFIX): CFLAGS += -mfpu=neon$(if $(USES_SOFTFLOAT_ABI), -mfloat-abi=softfp) endif @@ -720,6 +721,7 @@ endif ifeq ($(CPU_ARCH),aarch64) $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto $(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto +$(OBJDIR)/$(PROG_PREFIX)sha256-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto endif ifeq ($(CPU_ARCH),ppc) diff --git a/lib/freebl/blinit.c b/lib/freebl/blinit.c index db7d1eab1..4f2d2d492 100644 --- a/lib/freebl/blinit.c +++ b/lib/freebl/blinit.c @@ -232,6 +232,7 @@ CheckARMSupport() arm_neon_support_ = PR_GetEnvSecure("NSS_DISABLE_ARM_NEON") == NULL; arm_aes_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_AES") == NULL; arm_pmull_support_ &= PR_GetEnvSecure("NSS_DISABLE_PMULL") == NULL; + arm_sha2_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_SHA2") == NULL; } #endif /* defined(__aarch64__) */ @@ -355,6 +356,7 @@ CheckARMSupport() arm_sha2_support_ = hwcaps & HWCAP2_SHA2; } arm_neon_support_ = GetNeonSupport(); + arm_sha2_support_ &= PR_GetEnvSecure("NSS_DISABLE_HW_SHA2") == NULL; } #endif /* defined(__arm__) */ diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index f3bb8a71a..16fa61c6b 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -329,6 +329,7 @@ 'type': 'static_library', 'sources': [ 'aes-armv8.c', + 'sha256-armv8.c', ], 'dependencies': [ '<(DEPTH)/exports.gyp:nss_exports' @@ -384,7 +385,7 @@ 'dependencies': [ 'gcm-aes-x86_c_lib', ], - }, 'disable_arm_hw_aes==0 
and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { + }, '(disable_arm_hw_aes==0 or disable_arm_hw_sha2==0) and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { 'dependencies': [ 'armv8_c_lib' ], @@ -642,6 +643,11 @@ 'USE_HW_AES', ], }], + [ 'OS=="win" and (target_arch=="arm64" or target_arch=="aarch64") and disable_arm_hw_sha2==0', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], [ 'cc_use_gnu_ld==1 and OS=="win" and target_arch=="x64"', { # mingw x64 'defines': [ @@ -704,6 +710,11 @@ 'USE_HW_AES', ], }], + [ 'disable_arm_hw_sha2==0 and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], ], }], ], diff --git a/lib/freebl/sha256-armv8.c b/lib/freebl/sha256-armv8.c new file mode 100644 index 000000000..17fe126c4 --- /dev/null +++ b/lib/freebl/sha256-armv8.c @@ -0,0 +1,203 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef USE_HW_SHA2 + +#ifndef __ARM_FEATURE_CRYPTO +#error "Compiler option is invalid" +#endif + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif + +#include "prcpucfg.h" +#include "prtypes.h" /* for PRUintXX */ +#include "prlong.h" +#include "blapi.h" +#include "sha256.h" + +#include <arm_neon.h> + +/* SHA-256 constants, K256. 
*/ +static const PRUint32 __attribute__((aligned(16))) K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define ROUND(n, a, b, c, d) \ + { \ + uint32x4_t t = vaddq_u32(a, k##n); \ + uint32x4_t wt = w0; \ + w0 = vsha256hq_u32(w0, w1, t); \ + w1 = vsha256h2q_u32(w1, wt, t); \ + if (n < 12) { \ + a = vsha256su0q_u32(a, b); \ + a = vsha256su1q_u32(a, c, d); \ + } \ + } + +void +SHA256_Compress_Native(SHA256Context *ctx) +{ + const uint32x4_t k0 = vld1q_u32(K256); + const uint32x4_t k1 = vld1q_u32(K256 + 4); + const uint32x4_t k2 = vld1q_u32(K256 + 8); + const uint32x4_t k3 = vld1q_u32(K256 + 12); + const uint32x4_t k4 = vld1q_u32(K256 + 16); + const uint32x4_t k5 = vld1q_u32(K256 + 20); + const uint32x4_t k6 = vld1q_u32(K256 + 24); + const uint32x4_t k7 = vld1q_u32(K256 + 28); + const uint32x4_t k8 = vld1q_u32(K256 + 32); + const uint32x4_t k9 = vld1q_u32(K256 + 36); + const uint32x4_t k10 = vld1q_u32(K256 + 40); + const uint32x4_t k11 = vld1q_u32(K256 + 44); + const uint32x4_t k12 = vld1q_u32(K256 + 48); + const uint32x4_t k13 = vld1q_u32(K256 + 52); + const uint32x4_t k14 = vld1q_u32(K256 + 56); + const uint32x4_t k15 = vld1q_u32(K256 + 60); + + uint32x4_t h0 = vld1q_u32(ctx->h); + uint32x4_t h1 = 
vld1q_u32(ctx->h + 4); + + unsigned char *input = ctx->u.b; + + uint32x4_t a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input))); + uint32x4_t b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); + uint32x4_t c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); + uint32x4_t d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); + + uint32x4_t w0 = h0; + uint32x4_t w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = vaddq_u32(h0, w0); + h1 = vaddq_u32(h1, w1); + + vst1q_u32(ctx->h, h0); + vst1q_u32(ctx->h + 4, h1); +} + +void +SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ + const uint32x4_t k0 = vld1q_u32(K256); + const uint32x4_t k1 = vld1q_u32(K256 + 4); + const uint32x4_t k2 = vld1q_u32(K256 + 8); + const uint32x4_t k3 = vld1q_u32(K256 + 12); + const uint32x4_t k4 = vld1q_u32(K256 + 16); + const uint32x4_t k5 = vld1q_u32(K256 + 20); + const uint32x4_t k6 = vld1q_u32(K256 + 24); + const uint32x4_t k7 = vld1q_u32(K256 + 28); + const uint32x4_t k8 = vld1q_u32(K256 + 32); + const uint32x4_t k9 = vld1q_u32(K256 + 36); + const uint32x4_t k10 = vld1q_u32(K256 + 40); + const uint32x4_t k11 = vld1q_u32(K256 + 44); + const uint32x4_t k12 = vld1q_u32(K256 + 48); + const uint32x4_t k13 = vld1q_u32(K256 + 52); + const uint32x4_t k14 = vld1q_u32(K256 + 56); + const uint32x4_t k15 = vld1q_u32(K256 + 60); + + unsigned int inBuf = ctx->sizeLo & 0x3f; + if (!inputLen) { + return; + } + + /* Add inputLen into the count of bytes processed, before processing */ + if ((ctx->sizeLo += inputLen) < inputLen) { + ctx->sizeHi++; + } + + /* if data already in buffer, attemp to fill rest 
of buffer */ + if (inBuf) { + unsigned int todo = SHA256_BLOCK_LENGTH - inBuf; + if (inputLen < todo) { + todo = inputLen; + } + memcpy(ctx->u.b + inBuf, input, todo); + input += todo; + inputLen -= todo; + if (inBuf + todo == SHA256_BLOCK_LENGTH) { + SHA256_Compress_Native(ctx); + } + } + + uint32x4_t h0 = vld1q_u32(ctx->h); + uint32x4_t h1 = vld1q_u32(ctx->h + 4); + + /* if enough data to fill one or more whole buffers, process them. */ + while (inputLen >= SHA256_BLOCK_LENGTH) { + uint32x4_t a, b, c, d; + a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input))); + b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); + c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); + d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); + input += SHA256_BLOCK_LENGTH; + inputLen -= SHA256_BLOCK_LENGTH; + + uint32x4_t w0 = h0; + uint32x4_t w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = vaddq_u32(h0, w0); + h1 = vaddq_u32(h1, w1); + } + + vst1q_u32(ctx->h, h0); + vst1q_u32(ctx->h + 4, h1); + + /* if data left over, fill it into buffer */ + if (inputLen) { + memcpy(ctx->u.b, input, inputLen); + } +} + +#endif /* USE_HW_SHA2 */ diff --git a/lib/freebl/sha256.h b/lib/freebl/sha256.h index c65ca152d..645118b07 100644 --- a/lib/freebl/sha256.h +++ b/lib/freebl/sha256.h @@ -7,6 +7,12 @@ #include "prtypes.h" +struct SHA256ContextStr; + +typedef void (*sha256_compress_t)(struct SHA256ContextStr *); +typedef void (*sha256_update_t)(struct SHA256ContextStr *, const unsigned char *, + unsigned int); + struct SHA256ContextStr { union { PRUint32 w[64]; /* message schedule, input buffer, plus 48 words */ @@ -14,6 +20,8 @@ struct 
SHA256ContextStr { } u; PRUint32 h[8]; /* 8 state variables */ PRUint32 sizeHi, sizeLo; /* 64-bit count of hashed bytes. */ + sha256_compress_t compress; + sha256_update_t update; }; #endif /* _SHA_256_H_ */ diff --git a/lib/freebl/sha512.c b/lib/freebl/sha512.c index f2a1a33ca..dc0ed776b 100644 --- a/lib/freebl/sha512.c +++ b/lib/freebl/sha512.c @@ -19,6 +19,7 @@ #include "secport.h" /* for PORT_XXX */ #include "blapi.h" #include "blapii.h" +#include "secerr.h" #include "sha256.h" /* for struct SHA256ContextStr */ #include "crypto_primitives.h" #include "ppc-crypto.h" /* for USE_PPC_CRYPTO */ @@ -156,6 +157,30 @@ swap4b(PRUint32 value) #define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ SHR(x, 3)) #define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ SHR(x, 10)) +void SHA256_Compress_Native(SHA256Context *ctx); +void SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, unsigned int inputLen); + +static void SHA256_Compress_Generic(SHA256Context *ctx); +static void SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen); + +#ifndef USE_HW_SHA2 +void +SHA256_Compress_Native(SHA256Context *ctx) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} + +void +SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} +#endif + SHA256Context * SHA256_NewContext(void) { @@ -177,6 +202,17 @@ SHA256_Begin(SHA256Context *ctx) { memset(ctx, 0, sizeof *ctx); memcpy(H, H256, sizeof H256); +#if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) + /* arm's implementation is tested on little endian only */ + if (arm_sha2_support()) { + ctx->compress = SHA256_Compress_Native; + ctx->update = SHA256_Update_Native; + } else +#endif + { + ctx->compress = SHA256_Compress_Generic; + ctx->update = SHA256_Update_Generic; + } } #if defined(USE_PPC_CRYPTO) @@ -273,7 +309,7 @@ SHA256_Begin(SHA256Context *ctx) ROUND(63, b, c, d, e, f, g, 
h, a) static void -SHA256_Compress(SHA256Context *ctx) +SHA256_Compress_Generic(SHA256Context *ctx) { #if defined(USE_PPC_CRYPTO) vec_u32 w[16], s0, s1; @@ -475,6 +511,13 @@ void SHA256_Update(SHA256Context *ctx, const unsigned char *input, unsigned int inputLen) { + ctx->update(ctx, input, inputLen); +} + +static void +SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ unsigned int inBuf = ctx->sizeLo & 0x3f; if (!inputLen) return; @@ -492,7 +535,7 @@ SHA256_Update(SHA256Context *ctx, const unsigned char *input, input += todo; inputLen -= todo; if (inBuf + todo == SHA256_BLOCK_LENGTH) - SHA256_Compress(ctx); + SHA256_Compress_Generic(ctx); } /* if enough data to fill one or more whole buffers, process them. */ @@ -500,7 +543,7 @@ SHA256_Update(SHA256Context *ctx, const unsigned char *input, memcpy(B, input, SHA256_BLOCK_LENGTH); input += SHA256_BLOCK_LENGTH; inputLen -= SHA256_BLOCK_LENGTH; - SHA256_Compress(ctx); + SHA256_Compress_Generic(ctx); } /* if data left over, fill it into buffer */ if (inputLen) @@ -518,7 +561,7 @@ SHA256_End(SHA256Context *ctx, unsigned char *digest, hi = (ctx->sizeHi << 3) | (ctx->sizeLo >> 29); lo = (ctx->sizeLo << 3); - SHA256_Update(ctx, pad, padLen); + ctx->update(ctx, pad, padLen); #if defined(IS_LITTLE_ENDIAN) W[14] = SHA_HTONL(hi); @@ -527,7 +570,7 @@ SHA256_End(SHA256Context *ctx, unsigned char *digest, W[14] = hi; W[15] = lo; #endif - SHA256_Compress(ctx); + ctx->compress(ctx); /* now output the answer */ #if defined(IS_LITTLE_ENDIAN) @@ -651,13 +694,24 @@ SHA224_Begin(SHA224Context *ctx) { memset(ctx, 0, sizeof *ctx); memcpy(H, H224, sizeof H224); +#if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) + /* arm's implementation is tested on little endian only */ + if (arm_sha2_support()) { + ctx->compress = SHA256_Compress_Native; + ctx->update = SHA256_Update_Native; + } else +#endif + { + ctx->compress = SHA256_Compress_Generic; + ctx->update = SHA256_Update_Generic; + } } void 
SHA224_Update(SHA224Context *ctx, const unsigned char *input, unsigned int inputLen) { - SHA256_Update(ctx, input, inputLen); + ctx->update(ctx, input, inputLen); } void |