Bug 1424663 - vectorized ChaCha20 from HACL* for SSSE3 and ARM NEON, r=ttaubert

Summary: This adds the vectorized ChaCha20 implementation from HACL* to NSS and replaces the old vectorized code. Note that this is not used on Android as we currently have no way of testing this for Android or use it on Android for Firefox. Reviewers: ttaubert Reviewed By: ttaubert Bug #: 1424663 Differential Revision: https://phabricator.services.mozilla.com/D467
author: Franziskus Kiefer <franziskuskiefer@gmail.com> 2018-02-19 12:32:59 +0100
committer: Franziskus Kiefer <franziskuskiefer@gmail.com> 2018-02-19 12:32:59 +0100
commit: 89175c78faa076ba0d1be9cfe845f97c8a70e0fd (patch)
tree: 1ea85e91b8f3efd7eee525177d9fea8a2284701c
parent: e2f4deee109a08371fc0fa43e318d337c3dd6f04 (diff)
download: nss-hg-89175c78faa076ba0d1be9cfe845f97c8a70e0fd.tar.gz
20 files changed, 1136 insertions, 413 deletions
diff --git a/automation/taskcluster/docker-hacl/Dockerfile b/automation/taskcluster/docker-hacl/Dockerfile
index e8a88f06c..fefa89288 100644
--- a/automation/taskcluster/docker-hacl/Dockerfile
+++ b/automation/taskcluster/docker-hacl/Dockerfile
@@ -9,7 +9,7 @@ ENV haclrepo https://github.com/mitls/hacl-star.git
 
 # Define versions of dependencies
 ENV opamv 4.04.2
-ENV haclversion dcd48329d535727dbde93877b124c5ec4a7a2b20
+ENV haclversion 104de0fbc83939a5e76012d64e3db2b3c0524bd1
 
 # Install required packages and set versions
 ADD setup.sh /tmp/setup.sh
diff --git a/automation/taskcluster/graph/src/extend.js b/automation/taskcluster/graph/src/extend.js
index f6e7b9b75..ee9ac9b74 100644
--- a/automation/taskcluster/graph/src/extend.js
+++ b/automation/taskcluster/graph/src/extend.js
@@ -77,7 +77,8 @@ queue.filter(task => {
     }
   }
 
-  if (task.tests == "fips" && task.platform == "mac") {
+  if (task.tests == "fips" &&
+     (task.platform == "mac" || task.platform == "aarch64")) {
     return false;
   }
 
@@ -93,7 +94,7 @@ queue.filter(task => {
     }
   }
 
-  // Don't run additional hardware tests on ARM (we don't have anything there).
+  // Don't run all additional hardware tests on ARM.
   if (task.group == "Cipher" && task.platform == "aarch64" && task.env &&
       (task.env.NSS_DISABLE_PCLMUL == "1" || task.env.NSS_DISABLE_HW_AES == "1"
        || task.env.NSS_DISABLE_AVX == "1")) {
@@ -271,6 +272,18 @@ export default async function main() {
     }, aarch64_base)
   );
 
+  await scheduleLinux("Linux AArch64 (debug, make)",
+    merge({
+      env: {USE_64: "1"},
+      command: [
+         "/bin/bash",
+         "-c",
+         "bin/checkout.sh && nss/automation/taskcluster/scripts/build.sh"
+      ],
+      collection: "make",
+    }, aarch64_base)
+  );
+
   await scheduleMac("Mac (opt)", {collection: "opt"}, "--opt");
   await scheduleMac("Mac (debug)", {collection: "debug"});
 }
@@ -900,6 +913,13 @@ function scheduleTests(task_build, task_cert, test_base) {
     env: {NSS_DISABLE_AVX: "1"}, group: "Cipher"
   }));
   queue.scheduleTask(merge(no_cert_base, {
+    name: "Cipher tests", symbol: "NoSSSE3|NEON", tests: "cipher",
+    env: {
+      NSS_DISABLE_ARM_NEON: "1",
+      NSS_DISABLE_SSSE3: "1"
+    }, group: "Cipher"
+  }));
+  queue.scheduleTask(merge(no_cert_base, {
     name: "EC tests", symbol: "EC", tests: "ec"
   }));
   queue.scheduleTask(merge(no_cert_base, {
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index 6c8e6a2ea..a4b1a86ae 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -519,23 +519,16 @@ ifndef NSS_DISABLE_CHACHAPOLY
         else
             EXTRA_SRCS += poly1305.c
         endif
-
-        ifneq (1,$(CC_IS_GCC))
-            EXTRA_SRCS += chacha20.c
-            VERIFIED_SRCS += Hacl_Chacha20.c
-        else
-            EXTRA_SRCS += chacha20_vec.c
-        endif
     else
         ifeq ($(CPU_ARCH),aarch64)
             EXTRA_SRCS += Hacl_Poly1305_64.c
         else
             EXTRA_SRCS += poly1305.c
         endif
-
-        EXTRA_SRCS += chacha20.c
-        VERIFIED_SRCS += Hacl_Chacha20.c
     endif # x86_64
+
+    VERIFIED_SRCS += Hacl_Chacha20.c
+    VERIFIED_SRCS += Hacl_Chacha20_Vec128.c
 endif # NSS_DISABLE_CHACHAPOLY
 
 ifeq (,$(filter-out i386 x386 x86 x86_64 aarch64,$(CPU_ARCH)))
diff --git a/lib/freebl/blapii.h b/lib/freebl/blapii.h
index bcf62e9f3..743a1168b 100644
--- a/lib/freebl/blapii.h
+++ b/lib/freebl/blapii.h
@@ -80,5 +80,11 @@ SECStatus generate_prime(mp_int *prime, int primeLen);
 PRBool aesni_support();
 PRBool clmul_support();
 PRBool avx_support();
+PRBool ssse3_support();
+PRBool arm_neon_support();
+PRBool arm_aes_support();
+PRBool arm_pmull_support();
+PRBool arm_sha1_support();
+PRBool arm_sha2_support();
 
 #endif /* _BLAPII_H_ */
diff --git a/lib/freebl/blinit.c b/lib/freebl/blinit.c
index d7f2ec53a..4ac1c49ad 100644
--- a/lib/freebl/blinit.c
+++ b/lib/freebl/blinit.c
@@ -23,6 +23,12 @@ static PRCallOnceType coFreeblInit;
 static PRBool aesni_support_ = PR_FALSE;
 static PRBool clmul_support_ = PR_FALSE;
 static PRBool avx_support_ = PR_FALSE;
+static PRBool ssse3_support_ = PR_FALSE;
+static PRBool arm_neon_support_ = PR_FALSE;
+static PRBool arm_aes_support_ = PR_FALSE;
+static PRBool arm_sha1_support_ = PR_FALSE;
+static PRBool arm_sha2_support_ = PR_FALSE;
+static PRBool arm_pmull_support_ = PR_FALSE;
 
 #ifdef NSS_X86_OR_X64
 /*
@@ -62,6 +68,7 @@ check_xcr0_ymm()
 #define ECX_XSAVE (1 << 26)
 #define ECX_OSXSAVE (1 << 27)
 #define ECX_AVX (1 << 28)
+#define ECX_SSSE3 (1 << 9)
 #define AVX_BITS (ECX_XSAVE | ECX_OSXSAVE | ECX_AVX)
 
 void
@@ -71,6 +78,7 @@ CheckX86CPUSupport()
     char *disable_hw_aes = PR_GetEnvSecure("NSS_DISABLE_HW_AES");
     char *disable_pclmul = PR_GetEnvSecure("NSS_DISABLE_PCLMUL");
     char *disable_avx = PR_GetEnvSecure("NSS_DISABLE_AVX");
+    char *disable_ssse3 = PR_GetEnvSecure("NSS_DISABLE_SSSE3");
     freebl_cpuid(1, &eax, &ebx, &ecx, &edx);
     aesni_support_ = (PRBool)((ecx & ECX_AESNI) != 0 && disable_hw_aes == NULL);
     clmul_support_ = (PRBool)((ecx & ECX_CLMUL) != 0 && disable_pclmul == NULL);
@@ -78,9 +86,107 @@ CheckX86CPUSupport()
      * as well as XMM and YMM state. */
     avx_support_ = (PRBool)((ecx & AVX_BITS) == AVX_BITS) && check_xcr0_ymm() &&
                    disable_avx == NULL;
+    ssse3_support_ = (PRBool)((ecx & ECX_SSSE3) != 0 &&
+                              disable_ssse3 == NULL);
 }
 #endif /* NSS_X86_OR_X64 */
 
+#if (defined(__aarch64__) || defined(__arm__)) && !defined(__ANDROID__)
+#if defined(__GNUC__) && __GNUC__ >= 2 && defined(__ELF__)
+#include <sys/auxv.h>
+extern unsigned long getauxval(unsigned long type) __attribute__((weak));
+#else
+static unsigned long (*getauxval)(unsigned long) = NULL;
+#define AT_HWCAP2
+#define AT_HWCAP
+#endif /* defined(__GNUC__) && __GNUC__ >= 2 && defined(__ELF__)*/
+#endif /* (defined(__aarch64__) || defined(__arm__)) && !defined(__ANDROID__) */
+
+#if defined(__aarch64__) && !defined(__ANDROID__)
+// Defines from hwcap.h in Linux kernel - ARM64
+#define HWCAP_AES (1 << 3)
+#define HWCAP_PMULL (1 << 4)
+#define HWCAP_SHA1 (1 << 5)
+#define HWCAP_SHA2 (1 << 6)
+
+void
+CheckARMSupport()
+{
+    char *disable_arm_neon = PR_GetEnvSecure("NSS_DISABLE_ARM_NEON");
+    char *disable_hw_aes = PR_GetEnvSecure("NSS_DISABLE_HW_AES");
+    if (getauxval) {
+        long hwcaps = getauxval(AT_HWCAP);
+        arm_aes_support_ = hwcaps & HWCAP_AES && disable_hw_aes == NULL;
+        arm_pmull_support_ = hwcaps & HWCAP_PMULL;
+        arm_sha1_support_ = hwcaps & HWCAP_SHA1;
+        arm_sha2_support_ = hwcaps & HWCAP_SHA2;
+    }
+    /* aarch64 must support NEON. */
+    arm_neon_support_ = disable_arm_neon == NULL;
+}
+#endif /* defined(__aarch64__) && !defined(__ANDROID__) */
+
+#if defined(__arm__) && !defined(__ANDROID__)
+// Defines from hwcap.h in Linux kernel - ARM
+/*
+ * HWCAP flags - for elf_hwcap (in kernel) and AT_HWCAP
+ */
+#define HWCAP_NEON (1 << 12)
+
+/*
+ * HWCAP2 flags - for elf_hwcap2 (in kernel) and AT_HWCAP2
+ */
+#define HWCAP2_AES (1 << 0)
+#define HWCAP2_PMULL (1 << 1)
+#define HWCAP2_SHA1 (1 << 2)
+#define HWCAP2_SHA2 (1 << 3)
+
+void
+CheckARMSupport()
+{
+    char *disable_arm_neon = PR_GetEnvSecure("NSS_DISABLE_ARM_NEON");
+    char *disable_hw_aes = PR_GetEnvSecure("NSS_DISABLE_HW_AES");
+    if (getauxval) {
+        long hwcaps = getauxval(AT_HWCAP2);
+        arm_aes_support_ = hwcaps & HWCAP2_AES && disable_hw_aes == NULL;
+        arm_pmull_support_ = hwcaps & HWCAP2_PMULL;
+        arm_sha1_support_ = hwcaps & HWCAP2_SHA1;
+        arm_sha2_support_ = hwcaps & HWCAP2_SHA2;
+        arm_neon_support_ = hwcaps & HWCAP_NEON && disable_arm_neon == NULL;
+    }
+}
+#endif /* defined(__arm__) && !defined(__ANDROID__) */
+
+// Enable when Firefox can use it.
+// #if defined(__ANDROID__) && (defined(__arm__) || defined(__aarch64__))
+// #include <cpu-features.h>
+// void
+// CheckARMSupport()
+// {
+//     char *disable_arm_neon = PR_GetEnvSecure("NSS_DISABLE_ARM_NEON");
+//     char *disable_hw_aes = PR_GetEnvSecure("NSS_DISABLE_HW_AES");
+//     AndroidCpuFamily family = android_getCpuFamily();
+//     uint64_t features = android_getCpuFeatures();
+//     if (family == ANDROID_CPU_FAMILY_ARM64) {
+//         arm_aes_support_ = features & ANDROID_CPU_ARM64_FEATURE_AES &&
+//                            disable_hw_aes == NULL;
+//         arm_pmull_support_ = features & ANDROID_CPU_ARM64_FEATURE_PMULL;
+//         arm_sha1_support_ = features & ANDROID_CPU_ARM64_FEATURE_SHA1;
+//         arm_sha2_support_ = features & ANDROID_CPU_ARM64_FEATURE_SHA2;
+//         arm_neon_support_ = disable_arm_neon == NULL;
+//     }
+//     if (family == ANDROID_CPU_FAMILY_ARM) {
+//         arm_aes_support_ = features & ANDROID_CPU_ARM_FEATURE_AES &&
+//                            disable_hw_aes == NULL;
+//         arm_pmull_support_ = features & ANDROID_CPU_ARM_FEATURE_PMULL;
+//         arm_sha1_support_ = features & ANDROID_CPU_ARM_FEATURE_SHA1;
+//         arm_sha2_support_ = features & ANDROID_CPU_ARM_FEATURE_SHA2;
+//         arm_neon_support_ = hwcaps & ANDROID_CPU_ARM_FEATURE_NEON &&
+//                             disable_arm_neon == NULL;
+//     }
+// }
+// #endif /* defined(__ANDROID__) && (defined(__arm__) || defined(__aarch64__)) */
+
 PRBool
 aesni_support()
 {
@@ -96,12 +202,44 @@ avx_support()
 {
     return avx_support_;
 }
+PRBool
+ssse3_support()
+{
+    return ssse3_support_;
+}
+PRBool
+arm_neon_support()
+{
+    return arm_neon_support_;
+}
+PRBool
+arm_aes_support()
+{
+    return arm_aes_support_;
+}
+PRBool
+arm_pmull_support()
+{
+    return arm_pmull_support_;
+}
+PRBool
+arm_sha1_support()
+{
+    return arm_sha1_support_;
+}
+PRBool
+arm_sha2_support()
+{
+    return arm_sha2_support_;
+}
 
 static PRStatus
 FreeblInit(void)
 {
 #ifdef NSS_X86_OR_X64
     CheckX86CPUSupport();
+#elif (defined(__aarch64__) || defined(__arm__)) && !defined(__ANDROID__)
+    CheckARMSupport();
 #endif
     return PR_SUCCESS;
 }
diff --git a/lib/freebl/chacha20.c b/lib/freebl/chacha20.c
deleted file mode 100644
index 15ed67b5b..000000000
--- a/lib/freebl/chacha20.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-/* Adopted from the public domain code in NaCl by djb. */
-
-#include <string.h>
-#include <stdio.h>
-
-#include "chacha20.h"
-#include "verified/Hacl_Chacha20.h"
-
-void
-ChaCha20XOR(unsigned char *out, const unsigned char *in, unsigned int inLen,
-            const unsigned char key[32], const unsigned char nonce[12],
-            uint32_t counter)
-{
-    Hacl_Chacha20_chacha20(out, (uint8_t *)in, inLen, (uint8_t *)key, (uint8_t *)nonce, counter);
-}
diff --git a/lib/freebl/chacha20.h b/lib/freebl/chacha20.h
deleted file mode 100644
index 7e396fa8c..000000000
--- a/lib/freebl/chacha20.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * chacha20.h - header file for ChaCha20 implementation.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef FREEBL_CHACHA20_H_
-#define FREEBL_CHACHA20_H_
-
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "prtypes.h"
-typedef PRUint32 uint32_t;
-typedef PRUint64 uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-/* ChaCha20XOR encrypts |inLen| bytes from |in| with the given key and
- * nonce and writes the result to |out|, which may be equal to |in|. The
- * initial block counter is specified by |counter|. */
-extern void ChaCha20XOR(unsigned char *out, const unsigned char *in,
-                        unsigned int inLen, const unsigned char key[32],
-                        const unsigned char nonce[12], uint32_t counter);
-
-#endif /* FREEBL_CHACHA20_H_ */
diff --git a/lib/freebl/chacha20_vec.c b/lib/freebl/chacha20_vec.c
deleted file mode 100644
index 12f94d897..000000000
--- a/lib/freebl/chacha20_vec.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
- * marked as public domain. It was been altered to allow for non-aligned inputs
- * and to allow the block counter to be passed in specifically. */
-
-#include <string.h>
-
-#include "chacha20.h"
-#include "blapii.h"
-
-#ifndef CHACHA_RNDS
-#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */
-#endif
-
-/* Architecture-neutral way to specify 16-byte vector of ints              */
-typedef unsigned vec __attribute__((vector_size(16)));
-
-/* This implementation is designed for Neon, SSE and AltiVec machines. The
- * following specify how to do certain vector operations efficiently on
- * each architecture, using intrinsics.
- * This implementation supports parallel processing of multiple blocks,
- * including potentially using general-purpose registers.
- */
-#if __ARM_NEON__
-#include <arm_neon.h>
-#define GPR_TOO 1
-#define VBPI 2
-#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
-#define LOAD(m) (vec)(*((vec *)(m)))
-#define STORE(m, r) (*((vec *)(m))) = (r)
-#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
-#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
-#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
-#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
-#if __clang__
-#define ROTW7(x) (x << ((vec){ 7, 7, 7, 7 })) ^ (x >> ((vec){ 25, 25, 25, 25 }))
-#define ROTW8(x) (x << ((vec){ 8, 8, 8, 8 })) ^ (x >> ((vec){ 24, 24, 24, 24 }))
-#define ROTW12(x) (x << ((vec){ 12, 12, 12, 12 })) ^ (x >> ((vec){ 20, 20, 20, 20 }))
-#else
-#define ROTW7(x) (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
-#define ROTW8(x) (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
-#define ROTW12(x) (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
-#endif
-#elif __SSE2__
-#include <emmintrin.h>
-#define GPR_TOO 0
-#if __clang__
-#define VBPI 4
-#else
-#define VBPI 3
-#endif
-#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
-#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
-#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
-#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
-#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
-#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
-#define ROTW7(x) (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
-#define ROTW12(x) (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
-#if __SSSE3__
-#include <tmmintrin.h>
-#define ROTW8(x) (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3))
-#define ROTW16(x) (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
-#else
-#define ROTW8(x) (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
-#define ROTW16(x) (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
-#endif
-#else
-#error-- Implementation supports only machines with neon or SSE2
-#endif
-
-#ifndef REVV_BE
-#define REVV_BE(x) (x)
-#endif
-
-#ifndef REVW_BE
-#define REVW_BE(x) (x)
-#endif
-
-#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration   */
-
-#define DQROUND_VECTORS(a, b, c, d) \
-    a += b;                         \
-    d ^= a;                         \
-    d = ROTW16(d);                  \
-    c += d;                         \
-    b ^= c;                         \
-    b = ROTW12(b);                  \
-    a += b;                         \
-    d ^= a;                         \
-    d = ROTW8(d);                   \
-    c += d;                         \
-    b ^= c;                         \
-    b = ROTW7(b);                   \
-    b = ROTV1(b);                   \
-    c = ROTV2(c);                   \
-    d = ROTV3(d);                   \
-    a += b;                         \
-    d ^= a;                         \
-    d = ROTW16(d);                  \
-    c += d;                         \
-    b ^= c;                         \
-    b = ROTW12(b);                  \
-    a += b;                         \
-    d ^= a;                         \
-    d = ROTW8(d);                   \
-    c += d;                         \
-    b ^= c;                         \
-    b = ROTW7(b);                   \
-    b = ROTV3(b);                   \
-    c = ROTV2(c);                   \
-    d = ROTV1(d);
-
-#define QROUND_WORDS(a, b, c, d) \
-    a = a + b;                   \
-    d ^= a;                      \
-    d = d << 16 | d >> 16;       \
-    c = c + d;                   \
-    b ^= c;                      \
-    b = b << 12 | b >> 20;       \
-    a = a + b;                   \
-    d ^= a;                      \
-    d = d << 8 | d >> 24;        \
-    c = c + d;                   \
-    b ^= c;                      \
-    b = b << 7 | b >> 25;
-
-#define WRITE_XOR(in, op, d, v0, v1, v2, v3)           \
-    STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \
-    STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \
-    STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
-    STORE(op + d + 12, LOAD(in + d + 12) ^ REVV_BE(v3));
-
-void NO_SANITIZE_ALIGNMENT
-ChaCha20XOR(unsigned char *out, const unsigned char *in, unsigned int inlen,
-            const unsigned char key[32], const unsigned char nonce[12],
-            uint32_t counter)
-{
-    unsigned iters, i, *op = (unsigned *)out, *ip = (unsigned *)in, *kp;
-#if defined(__ARM_NEON__)
-    unsigned *np;
-#endif
-    vec s0, s1, s2, s3;
-#if !defined(__ARM_NEON__) && !defined(__SSE2__)
-    __attribute__((aligned(16))) unsigned key[8], nonce[4];
-#endif
-    __attribute__((aligned(16))) unsigned chacha_const[] =
-        { 0x61707865, 0x3320646E, 0x79622D32, 0x6B206574 };
-#if defined(__ARM_NEON__) || defined(__SSE2__)
-    kp = (unsigned *)key;
-#else
-    ((vec *)key)[0] = REVV_BE(((vec *)key)[0]);
-    ((vec *)key)[1] = REVV_BE(((vec *)key)[1]);
-    ((unsigned *)nonce)[0] = REVW_BE(((unsigned *)nonce)[0]);
-    ((unsigned *)nonce)[1] = REVW_BE(((unsigned *)nonce)[1]);
-    ((unsigned *)nonce)[2] = REVW_BE(((unsigned *)nonce)[2]);
-    ((unsigned *)nonce)[3] = REVW_BE(((unsigned *)nonce)[3]);
-    kp = (unsigned *)key;
-    np = (unsigned *)nonce;
-#endif
-#if defined(__ARM_NEON__)
-    np = (unsigned *)nonce;
-#endif
-    s0 = LOAD(chacha_const);
-    s1 = LOAD(&((vec *)kp)[0]);
-    s2 = LOAD(&((vec *)kp)[1]);
-    s3 = (vec){
-        counter,
-        ((uint32_t *)nonce)[0],
-        ((uint32_t *)nonce)[1],
-        ((uint32_t *)nonce)[2]
-    };
-
-    for (iters = 0; iters < inlen / (BPI * 64); iters++) {
-#if GPR_TOO
-        register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
-            x9, x10, x11, x12, x13, x14, x15;
-#endif
-#if VBPI > 2
-        vec v8, v9, v10, v11;
-#endif
-#if VBPI > 3
-        vec v12, v13, v14, v15;
-#endif
-
-        vec v0, v1, v2, v3, v4, v5, v6, v7;
-        v4 = v0 = s0;
-        v5 = v1 = s1;
-        v6 = v2 = s2;
-        v3 = s3;
-        v7 = v3 + ONE;
-#if VBPI > 2
-        v8 = v4;
-        v9 = v5;
-        v10 = v6;
-        v11 = v7 + ONE;
-#endif
-#if VBPI > 3
-        v12 = v8;
-        v13 = v9;
-        v14 = v10;
-        v15 = v11 + ONE;
-#endif
-#if GPR_TOO
-        x0 = chacha_const[0];
-        x1 = chacha_const[1];
-        x2 = chacha_const[2];
-        x3 = chacha_const[3];
-        x4 = kp[0];
-        x5 = kp[1];
-        x6 = kp[2];
-        x7 = kp[3];
-        x8 = kp[4];
-        x9 = kp[5];
-        x10 = kp[6];
-        x11 = kp[7];
-        x12 = counter + BPI * iters + (BPI - 1);
-        x13 = np[0];
-        x14 = np[1];
-        x15 = np[2];
-#endif
-        for (i = CHACHA_RNDS / 2; i; i--) {
-            DQROUND_VECTORS(v0, v1, v2, v3)
-            DQROUND_VECTORS(v4, v5, v6, v7)
-#if VBPI > 2
-            DQROUND_VECTORS(v8, v9, v10, v11)
-#endif
-#if VBPI > 3
-            DQROUND_VECTORS(v12, v13, v14, v15)
-#endif
-#if GPR_TOO
-            QROUND_WORDS(x0, x4, x8, x12)
-            QROUND_WORDS(x1, x5, x9, x13)
-            QROUND_WORDS(x2, x6, x10, x14)
-            QROUND_WORDS(x3, x7, x11, x15)
-            QROUND_WORDS(x0, x5, x10, x15)
-            QROUND_WORDS(x1, x6, x11, x12)
-            QROUND_WORDS(x2, x7, x8, x13)
-            QROUND_WORDS(x3, x4, x9, x14)
-#endif
-        }
-
-        WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
-        s3 += ONE;
-        WRITE_XOR(ip, op, 16, v4 + s0, v5 + s1, v6 + s2, v7 + s3)
-        s3 += ONE;
-#if VBPI > 2
-        WRITE_XOR(ip, op, 32, v8 + s0, v9 + s1, v10 + s2, v11 + s3)
-        s3 += ONE;
-#endif
-#if VBPI > 3
-        WRITE_XOR(ip, op, 48, v12 + s0, v13 + s1, v14 + s2, v15 + s3)
-        s3 += ONE;
-#endif
-        ip += VBPI * 16;
-        op += VBPI * 16;
-#if GPR_TOO
-        op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0]));
-        op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1]));
-        op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2]));
-        op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3]));
-        op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0]));
-        op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1]));
-        op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2]));
-        op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3]));
-        op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4]));
-        op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5]));
-        op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
-        op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
-        op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter + BPI * iters + (BPI - 1)));
-        op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13 + np[0]));
-        op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[1]));
-        op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[2]));
-        s3 += ONE;
-        ip += 16;
-        op += 16;
-#endif
-    }
-
-    for (iters = inlen % (BPI * 64) / 64; iters != 0; iters--) {
-        vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
-        for (i = CHACHA_RNDS / 2; i; i--) {
-            DQROUND_VECTORS(v0, v1, v2, v3);
-        }
-        WRITE_XOR(ip, op, 0, v0 + s0, v1 + s1, v2 + s2, v3 + s3)
-        s3 += ONE;
-        ip += 16;
-        op += 16;
-    }
-
-    inlen = inlen % 64;
-    if (inlen) {
-        __attribute__((aligned(16))) vec buf[4];
-        vec v0, v1, v2, v3;
-        v0 = s0;
-        v1 = s1;
-        v2 = s2;
-        v3 = s3;
-        for (i = CHACHA_RNDS / 2; i; i--) {
-            DQROUND_VECTORS(v0, v1, v2, v3);
-        }
-
-        if (inlen >= 16) {
-            STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
-            if (inlen >= 32) {
-                STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
-                if (inlen >= 48) {
-                    STORE(op + 8, LOAD(ip + 8) ^ REVV_BE(v2 + s2));
-                    buf[3] = REVV_BE(v3 + s3);
-                } else {
-                    buf[2] = REVV_BE(v2 + s2);
-                }
-            } else {
-                buf[1] = REVV_BE(v1 + s1);
-            }
-        } else {
-            buf[0] = REVV_BE(v0 + s0);
-        }
-
-        for (i = inlen & ~15; i < inlen; i++) {
-            ((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i];
-        }
-    }
-}
diff --git a/lib/freebl/chacha20poly1305.c b/lib/freebl/chacha20poly1305.c
index 991fa0ca3..859d05316 100644
--- a/lib/freebl/chacha20poly1305.c
+++ b/lib/freebl/chacha20poly1305.c
@@ -12,25 +12,28 @@
 #include "seccomon.h"
 #include "secerr.h"
 #include "blapit.h"
+#include "blapii.h"
 
 #ifndef NSS_DISABLE_CHACHAPOLY
-#if defined(HAVE_INT128_SUPPORT) && (defined(NSS_X86_OR_X64) || defined(__aarch64__))
-#include "verified/Hacl_Poly1305_64.h"
-#else
-#include "poly1305.h"
-#endif
-#include "chacha20.h"
 #include "chacha20poly1305.h"
-#endif
+// Forward declaration from "Hacl_Chacha20_Vec128.h".
+extern void Hacl_Chacha20_Vec128_chacha20(uint8_t *output, uint8_t *plain,
+                                          uint32_t len, uint8_t *k, uint8_t *n1,
+                                          uint32_t ctr);
+// Forward declaration from "Hacl_Chacha20.h".
+extern void Hacl_Chacha20_chacha20(uint8_t *output, uint8_t *plain, uint32_t len,
+                                   uint8_t *k, uint8_t *n1, uint32_t ctr);
 
 /* Poly1305Do writes the Poly1305 authenticator of the given additional data
  * and ciphertext to |out|. */
-#ifndef NSS_DISABLE_CHACHAPOLY
-
 #if defined(HAVE_INT128_SUPPORT) && (defined(NSS_X86_OR_X64) || defined(__aarch64__))
+/* Use HACL* Poly1305 on 64-bit Intel and ARM */
+#include "verified/Hacl_Poly1305_64.h"
 
 static void
-Poly1305PadUpdate(Hacl_Impl_Poly1305_64_State_poly1305_state state, unsigned char *block, const unsigned char *p, const unsigned int pLen)
+Poly1305PadUpdate(Hacl_Impl_Poly1305_64_State_poly1305_state state,
+                  unsigned char *block, const unsigned char *p,
+                  const unsigned int pLen)
 {
     unsigned int pRemLen = pLen % 16;
     Hacl_Poly1305_64_update(state, (uint8_t *)p, (pLen / 16));
@@ -46,7 +49,8 @@ Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
            const unsigned char key[32])
 {
     uint64_t tmp1[6U] = { 0U };
-    Hacl_Impl_Poly1305_64_State_poly1305_state state = Hacl_Poly1305_64_mk_state(tmp1, tmp1 + 3);
+    Hacl_Impl_Poly1305_64_State_poly1305_state state =
+        Hacl_Poly1305_64_mk_state(tmp1, tmp1 + 3);
 
     unsigned char block[16] = { 0 };
     Hacl_Poly1305_64_init(state, (uint8_t *)key);
@@ -68,6 +72,8 @@ Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
     Hacl_Poly1305_64_finish(state, out, (uint8_t *)(key + 16));
 }
 #else
+/* All other platforms get the 32-bit poly1305 reference implementation. */
+#include "poly1305.h"
 
 static void
 Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
@@ -165,6 +171,17 @@ ChaCha20Poly1305_DestroyContext(ChaCha20Poly1305Context *ctx, PRBool freeit)
 #endif
 }
 
+void
+ChaCha20Xor(uint8_t *output, uint8_t *block, uint32_t len, uint8_t *k,
+            uint8_t *nonce, uint32_t ctr)
+{
+    if (ssse3_support() || arm_neon_support()) {
+        Hacl_Chacha20_Vec128_chacha20(output, block, len, k, nonce, ctr);
+    } else {
+        Hacl_Chacha20_chacha20(output, block, len, k, nonce, ctr);
+    }
+}
+
 SECStatus
 ChaCha20Poly1305_Seal(const ChaCha20Poly1305Context *ctx, unsigned char *output,
                       unsigned int *outputLen, unsigned int maxOutputLen,
@@ -191,8 +208,10 @@ ChaCha20Poly1305_Seal(const ChaCha20Poly1305Context *ctx, unsigned char *output,
     PORT_Memset(block, 0, sizeof(block));
     // Generate a block of keystream. The first 32 bytes will be the poly1305
     // key. The remainder of the block is discarded.
-    ChaCha20XOR(block, block, sizeof(block), ctx->key, nonce, 0);
-    ChaCha20XOR(output, input, inputLen, ctx->key, nonce, 1);
+    ChaCha20Xor(block, (uint8_t *)block, sizeof(block), (uint8_t *)ctx->key,
+                (uint8_t *)nonce, 0);
+    ChaCha20Xor(output, (uint8_t *)input, inputLen, (uint8_t *)ctx->key,
+                (uint8_t *)nonce, 1);
 
     Poly1305Do(tag, ad, adLen, output, inputLen, block);
     PORT_Memcpy(output + inputLen, tag, ctx->tagLen);
@@ -233,14 +252,16 @@ ChaCha20Poly1305_Open(const ChaCha20Poly1305Context *ctx, unsigned char *output,
     PORT_Memset(block, 0, sizeof(block));
     // Generate a block of keystream. The first 32 bytes will be the poly1305
     // key. The remainder of the block is discarded.
-    ChaCha20XOR(block, block, sizeof(block), ctx->key, nonce, 0);
+    ChaCha20Xor(block, (uint8_t *)block, sizeof(block), (uint8_t *)ctx->key,
+                (uint8_t *)nonce, 0);
     Poly1305Do(tag, ad, adLen, input, ciphertextLen, block);
     if (NSS_SecureMemcmp(tag, &input[ciphertextLen], ctx->tagLen) != 0) {
         PORT_SetError(SEC_ERROR_BAD_DATA);
         return SECFailure;
     }
 
-    ChaCha20XOR(output, input, ciphertextLen, ctx->key, nonce, 1);
+    ChaCha20Xor(output, (uint8_t *)input, ciphertextLen, (uint8_t *)ctx->key,
+                (uint8_t *)nonce, 1);
 
     return SECSuccess;
 #endif
diff --git a/lib/freebl/det_rng.c b/lib/freebl/det_rng.c
index 53d48bc7c..56be2d356 100644
--- a/lib/freebl/det_rng.c
+++ b/lib/freebl/det_rng.c
@@ -4,7 +4,7 @@
 
 #include "blapi.h"
 #include "blapit.h"
-#include "chacha20.h"
+#include "Hacl_Chacha20.h"
 #include "nssilock.h"
 #include "seccomon.h"
 #include "secerr.h"
@@ -99,7 +99,7 @@ RNG_GenerateGlobalRandomBytes(void *dest, size_t len)
 
     memset(dest, 0, len);
     memcpy(dest, globalBytes, PR_MIN(len, GLOBAL_BYTES_SIZE));
-    ChaCha20XOR(dest, dest, len, key, nonce, 0);
+    Hacl_Chacha20_chacha20(dest, (uint8_t *)dest, len, (uint8_t *)key, nonce, 0);
     ChaCha20Poly1305_DestroyContext(cx, PR_TRUE);
 
     PZ_Unlock(rng_lock);
diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp
index f80d045c1..fae56f709 100644
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -10,7 +10,7 @@
       'target_name': 'intel-gcm-wrap_c_lib',
       'type': 'static_library',
       'sources': [
-        'intel-gcm-wrap.c'
+        'intel-gcm-wrap.c',
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports'
@@ -23,6 +23,38 @@
       ]
     },
     {
+      # TODO: make this so that all hardware accelerated code is in here.
+      'target_name': 'hw-acc-crypto',
+      'type': 'static_library',
+      'sources': [
+        'verified/Hacl_Chacha20_Vec128.c',
+      ],
+      'dependencies': [
+        '<(DEPTH)/exports.gyp:nss_exports'
+      ],
+      'conditions': [
+        [ 'target_arch=="ia32" or target_arch=="x64"', {
+          'cflags': [
+            '-mssse3'
+          ],
+          'cflags_mozilla': [
+            '-mssse3'
+          ],
+          # GCC doesn't define this.
+          'defines': [
+            '__SSSE3__',
+          ],
+        }],
+        [ 'OS=="android"', {
+          # On Android we can't use any of the hardware acceleration :(
+          'defines!': [
+            '__ARM_NEON__',
+            '__ARM_NEON',
+          ],
+        }],
+      ],
+    },
+    {
       'target_name': 'gcm-aes-x86_c_lib',
       'type': 'static_library',
       'sources': [
@@ -74,11 +106,12 @@
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports',
+        'hw-acc-crypto',
       ],
       'conditions': [
         [ 'target_arch=="ia32" or target_arch=="x64"', {
           'dependencies': [
-            'gcm-aes-x86_c_lib'
+            'gcm-aes-x86_c_lib',
           ],
         }],
         [ 'OS=="linux"', {
@@ -110,11 +143,12 @@
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports',
+        'hw-acc-crypto',
       ],
       'conditions': [
         [ 'target_arch=="ia32" or target_arch=="x64"', {
           'dependencies': [
-            'gcm-aes-x86_c_lib'
+            'gcm-aes-x86_c_lib',
           ]
         }],
         [ 'OS!="linux" and OS!="android"', {
diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi
index 44e28963b..ebd1018d8 100644
--- a/lib/freebl/freebl_base.gypi
+++ b/lib/freebl/freebl_base.gypi
@@ -144,12 +144,17 @@
       ],
     }],
     [ 'disable_chachapoly==0', {
+      # The ChaCha20 code is linked in through the static ssse3-crypto lib on
+      # all platforms that support SSSE3. There are runtime checks in place to
+      # choose the correct ChaCha implementation at runtime.
+      'sources': [
+        'verified/Hacl_Chacha20.c',
+      ],
       'conditions': [
         [ 'OS!="win"', {
           'conditions': [
             [ 'target_arch=="x64"', {
               'sources': [
-                'chacha20_vec.c',
                 'verified/Hacl_Poly1305_64.c',
               ],
             }, {
@@ -157,15 +162,11 @@
               'conditions': [
                 [ 'target_arch=="arm64" or target_arch=="aarch64"', {
                   'sources': [
-                    'chacha20.c',
-                    'verified/Hacl_Chacha20.c',
                     'verified/Hacl_Poly1305_64.c',
                   ],
                 }, {
                   # !Windows & !x64 & !arm64 & !aarch64
                   'sources': [
-                    'chacha20.c',
-                    'verified/Hacl_Chacha20.c',
                     'poly1305.c',
                   ],
                 }],
@@ -175,8 +176,6 @@
         }, {
           # Windows
           'sources': [
-            'chacha20.c',
-            'verified/Hacl_Chacha20.c',
             'poly1305.c',
           ],
         }],
diff --git a/lib/freebl/verified/Hacl_Chacha20_Vec128.c b/lib/freebl/verified/Hacl_Chacha20_Vec128.c
new file mode 100644
index 000000000..74b74a1ba
--- /dev/null
+++ b/lib/freebl/verified/Hacl_Chacha20_Vec128.c
@@ -0,0 +1,396 @@
+/* Copyright 2016-2017 INRIA and Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Hacl_Chacha20_Vec128.h"
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_State_state_incr(vec *k)
+{
+    vec k3 = k[3U];
+    k[3U] = vec_increment(k3);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_State_state_to_key_block(uint8_t *stream_block, vec *k)
+{
+    vec k0 = k[0U];
+    vec k1 = k[1U];
+    vec k2 = k[2U];
+    vec k3 = k[3U];
+    uint8_t *a = stream_block;
+    uint8_t *b = stream_block + (uint32_t)16U;
+    uint8_t *c = stream_block + (uint32_t)32U;
+    uint8_t *d = stream_block + (uint32_t)48U;
+    vec_store_le(a, k0);
+    vec_store_le(b, k1);
+    vec_store_le(c, k2);
+    vec_store_le(d, k3);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_State_state_setup(vec *st, uint8_t *k, uint8_t *n1, uint32_t c)
+{
+    st[0U] =
+        vec_load_32x4((uint32_t)0x61707865U,
+                      (uint32_t)0x3320646eU,
+                      (uint32_t)0x79622d32U,
+                      (uint32_t)0x6b206574U);
+    vec k0 = vec_load128_le(k);
+    vec k1 = vec_load128_le(k + (uint32_t)16U);
+    st[1U] = k0;
+    st[2U] = k1;
+    uint32_t n0 = load32_le(n1);
+    uint8_t *x00 = n1 + (uint32_t)4U;
+    uint32_t n10 = load32_le(x00);
+    uint8_t *x0 = n1 + (uint32_t)8U;
+    uint32_t n2 = load32_le(x0);
+    vec v1 = vec_load_32x4(c, n0, n10, n2);
+    st[3U] = v1;
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_round(vec *st)
+{
+    vec sa = st[0U];
+    vec sb0 = st[1U];
+    vec sd0 = st[3U];
+    vec sa10 = vec_add(sa, sb0);
+    vec sd10 = vec_rotate_left(vec_xor(sd0, sa10), (uint32_t)16U);
+    st[0U] = sa10;
+    st[3U] = sd10;
+    vec sa0 = st[2U];
+    vec sb1 = st[3U];
+    vec sd2 = st[1U];
+    vec sa11 = vec_add(sa0, sb1);
+    vec sd11 = vec_rotate_left(vec_xor(sd2, sa11), (uint32_t)12U);
+    st[2U] = sa11;
+    st[1U] = sd11;
+    vec sa2 = st[0U];
+    vec sb2 = st[1U];
+    vec sd3 = st[3U];
+    vec sa12 = vec_add(sa2, sb2);
+    vec sd12 = vec_rotate_left(vec_xor(sd3, sa12), (uint32_t)8U);
+    st[0U] = sa12;
+    st[3U] = sd12;
+    vec sa3 = st[2U];
+    vec sb = st[3U];
+    vec sd = st[1U];
+    vec sa1 = vec_add(sa3, sb);
+    vec sd1 = vec_rotate_left(vec_xor(sd, sa1), (uint32_t)7U);
+    st[2U] = sa1;
+    st[1U] = sd1;
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_double_round(vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_round(st);
+    vec r1 = st[1U];
+    vec r20 = st[2U];
+    vec r30 = st[3U];
+    st[1U] = vec_shuffle_right(r1, (uint32_t)1U);
+    st[2U] = vec_shuffle_right(r20, (uint32_t)2U);
+    st[3U] = vec_shuffle_right(r30, (uint32_t)3U);
+    Hacl_Impl_Chacha20_Vec128_round(st);
+    vec r10 = st[1U];
+    vec r2 = st[2U];
+    vec r3 = st[3U];
+    st[1U] = vec_shuffle_right(r10, (uint32_t)3U);
+    st[2U] = vec_shuffle_right(r2, (uint32_t)2U);
+    st[3U] = vec_shuffle_right(r3, (uint32_t)1U);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_double_round3(vec *st, vec *st_, vec *st__)
+{
+    Hacl_Impl_Chacha20_Vec128_double_round(st);
+    Hacl_Impl_Chacha20_Vec128_double_round(st_);
+    Hacl_Impl_Chacha20_Vec128_double_round(st__);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_sum_states(vec *st_, vec *st)
+{
+    vec s0 = st[0U];
+    vec s1 = st[1U];
+    vec s2 = st[2U];
+    vec s3 = st[3U];
+    vec s0_ = st_[0U];
+    vec s1_ = st_[1U];
+    vec s2_ = st_[2U];
+    vec s3_ = st_[3U];
+    st_[0U] = vec_add(s0_, s0);
+    st_[1U] = vec_add(s1_, s1);
+    st_[2U] = vec_add(s2_, s2);
+    st_[3U] = vec_add(s3_, s3);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_copy_state(vec *st_, vec *st)
+{
+    vec st0 = st[0U];
+    vec st1 = st[1U];
+    vec st2 = st[2U];
+    vec st3 = st[3U];
+    st_[0U] = st0;
+    st_[1U] = st1;
+    st_[2U] = st2;
+    st_[3U] = st3;
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_chacha20_core(vec *k, vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_copy_state(k, st);
+    for (uint32_t i = (uint32_t)0U; i < (uint32_t)10U; i = i + (uint32_t)1U)
+        Hacl_Impl_Chacha20_Vec128_double_round(k);
+    Hacl_Impl_Chacha20_Vec128_sum_states(k, st);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_state_incr(vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_State_state_incr(st);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_chacha20_incr3(vec *k0, vec *k1, vec *k2, vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_copy_state(k0, st);
+    Hacl_Impl_Chacha20_Vec128_copy_state(k1, st);
+    Hacl_Impl_Chacha20_Vec128_state_incr(k1);
+    Hacl_Impl_Chacha20_Vec128_copy_state(k2, k1);
+    Hacl_Impl_Chacha20_Vec128_state_incr(k2);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_chacha20_sum3(vec *k0, vec *k1, vec *k2, vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_sum_states(k0, st);
+    Hacl_Impl_Chacha20_Vec128_state_incr(st);
+    Hacl_Impl_Chacha20_Vec128_sum_states(k1, st);
+    Hacl_Impl_Chacha20_Vec128_state_incr(st);
+    Hacl_Impl_Chacha20_Vec128_sum_states(k2, st);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_chacha20_core3(vec *k0, vec *k1, vec *k2, vec *st)
+{
+    Hacl_Impl_Chacha20_Vec128_chacha20_incr3(k0, k1, k2, st);
+    for (uint32_t i = (uint32_t)0U; i < (uint32_t)10U; i = i + (uint32_t)1U)
+        Hacl_Impl_Chacha20_Vec128_double_round3(k0, k1, k2);
+    Hacl_Impl_Chacha20_Vec128_chacha20_sum3(k0, k1, k2, st);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_chacha20_block(uint8_t *stream_block, vec *st)
+{
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec k[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        k[_i] = vec_zero();
+    Hacl_Impl_Chacha20_Vec128_chacha20_core(k, st);
+    Hacl_Impl_Chacha20_Vec128_State_state_to_key_block(stream_block, k);
+}
+
+inline static void
+Hacl_Impl_Chacha20_Vec128_init(vec *st, uint8_t *k, uint8_t *n1, uint32_t ctr)
+{
+    Hacl_Impl_Chacha20_Vec128_State_state_setup(st, k, n1, ctr);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_update_last(uint8_t *output, uint8_t *plain, uint32_t len, vec *st)
+{
+    uint8_t block[64U] = { 0U };
+    Hacl_Impl_Chacha20_Vec128_chacha20_block(block, st);
+    uint8_t *mask = block;
+    for (uint32_t i = (uint32_t)0U; i < len; i = i + (uint32_t)1U) {
+        uint8_t xi = plain[i];
+        uint8_t yi = mask[i];
+        output[i] = xi ^ yi;
+    }
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_store_4_vec(uint8_t *output, vec v0, vec v1, vec v2, vec v3)
+{
+    uint8_t *o0 = output;
+    uint8_t *o1 = output + (uint32_t)16U;
+    uint8_t *o2 = output + (uint32_t)32U;
+    uint8_t *o3 = output + (uint32_t)48U;
+    vec_store_le(o0, v0);
+    vec_store_le(o1, v1);
+    vec_store_le(o2, v2);
+    vec_store_le(o3, v3);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_xor_block(uint8_t *output, uint8_t *plain, vec *st)
+{
+    vec p0 = vec_load_le(plain);
+    vec p1 = vec_load_le(plain + (uint32_t)16U);
+    vec p2 = vec_load_le(plain + (uint32_t)32U);
+    vec p3 = vec_load_le(plain + (uint32_t)48U);
+    vec k0 = st[0U];
+    vec k1 = st[1U];
+    vec k2 = st[2U];
+    vec k3 = st[3U];
+    vec o0 = vec_xor(p0, k0);
+    vec o1 = vec_xor(p1, k1);
+    vec o2 = vec_xor(p2, k2);
+    vec o3 = vec_xor(p3, k3);
+    Hacl_Impl_Chacha20_Vec128_store_4_vec(output, o0, o1, o2, o3);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_update(uint8_t *output, uint8_t *plain, vec *st)
+{
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec k[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        k[_i] = vec_zero();
+    Hacl_Impl_Chacha20_Vec128_chacha20_core(k, st);
+    Hacl_Impl_Chacha20_Vec128_xor_block(output, plain, k);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_update3(uint8_t *output, uint8_t *plain, vec *st)
+{
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec k0[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        k0[_i] = vec_zero();
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec k1[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        k1[_i] = vec_zero();
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec k2[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        k2[_i] = vec_zero();
+    Hacl_Impl_Chacha20_Vec128_chacha20_core3(k0, k1, k2, st);
+    uint8_t *p0 = plain;
+    uint8_t *p1 = plain + (uint32_t)64U;
+    uint8_t *p2 = plain + (uint32_t)128U;
+    uint8_t *o0 = output;
+    uint8_t *o1 = output + (uint32_t)64U;
+    uint8_t *o2 = output + (uint32_t)128U;
+    Hacl_Impl_Chacha20_Vec128_xor_block(o0, p0, k0);
+    Hacl_Impl_Chacha20_Vec128_xor_block(o1, p1, k1);
+    Hacl_Impl_Chacha20_Vec128_xor_block(o2, p2, k2);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_update3_(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    vec *st,
+    uint32_t i)
+{
+    uint8_t *out_block = output + (uint32_t)192U * i;
+    uint8_t *plain_block = plain + (uint32_t)192U * i;
+    Hacl_Impl_Chacha20_Vec128_update3(out_block, plain_block, st);
+    Hacl_Impl_Chacha20_Vec128_state_incr(st);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode_blocks3(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    vec *st)
+{
+    for (uint32_t i = (uint32_t)0U; i < len; i = i + (uint32_t)1U)
+        Hacl_Impl_Chacha20_Vec128_update3_(output, plain, len, st, i);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode_blocks(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    vec *st)
+{
+    uint32_t len3 = len / (uint32_t)3U;
+    uint32_t rest3 = len % (uint32_t)3U;
+    uint8_t *plain_ = plain;
+    uint8_t *blocks1 = plain + (uint32_t)192U * len3;
+    uint8_t *output_ = output;
+    uint8_t *outs = output + (uint32_t)192U * len3;
+    Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode_blocks3(output_, plain_, len3, st);
+    if (rest3 == (uint32_t)2U) {
+        uint8_t *block0 = blocks1;
+        uint8_t *block1 = blocks1 + (uint32_t)64U;
+        uint8_t *out0 = outs;
+        uint8_t *out1 = outs + (uint32_t)64U;
+        Hacl_Impl_Chacha20_Vec128_update(out0, block0, st);
+        Hacl_Impl_Chacha20_Vec128_state_incr(st);
+        Hacl_Impl_Chacha20_Vec128_update(out1, block1, st);
+        Hacl_Impl_Chacha20_Vec128_state_incr(st);
+    } else if (rest3 == (uint32_t)1U) {
+        Hacl_Impl_Chacha20_Vec128_update(outs, blocks1, st);
+        Hacl_Impl_Chacha20_Vec128_state_incr(st);
+    }
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    vec *st)
+{
+    uint32_t blocks_len = len >> (uint32_t)6U;
+    uint32_t part_len = len & (uint32_t)0x3fU;
+    uint8_t *output_ = output;
+    uint8_t *plain_ = plain;
+    uint8_t *output__ = output + (uint32_t)64U * blocks_len;
+    uint8_t *plain__ = plain + (uint32_t)64U * blocks_len;
+    Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode_blocks(output_, plain_, blocks_len, st);
+    if (part_len > (uint32_t)0U)
+        Hacl_Impl_Chacha20_Vec128_update_last(output__, plain__, part_len, st);
+}
+
+static void
+Hacl_Impl_Chacha20_Vec128_chacha20(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    uint8_t *k,
+    uint8_t *n1,
+    uint32_t ctr)
+{
+    KRML_CHECK_SIZE(vec_zero(), (uint32_t)4U);
+    vec buf[4U];
+    for (uint32_t _i = 0U; _i < (uint32_t)4U; ++_i)
+        buf[_i] = vec_zero();
+    vec *st = buf;
+    Hacl_Impl_Chacha20_Vec128_init(st, k, n1, ctr);
+    Hacl_Impl_Chacha20_Vec128_chacha20_counter_mode(output, plain, len, st);
+}
+
+void
+Hacl_Chacha20_Vec128_chacha20(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    uint8_t *k,
+    uint8_t *n1,
+    uint32_t ctr)
+{
+    Hacl_Impl_Chacha20_Vec128_chacha20(output, plain, len, k, n1, ctr);
+}
diff --git a/lib/freebl/verified/Hacl_Chacha20_Vec128.h b/lib/freebl/verified/Hacl_Chacha20_Vec128.h
new file mode 100644
index 000000000..57942093d
--- /dev/null
+++ b/lib/freebl/verified/Hacl_Chacha20_Vec128.h
@@ -0,0 +1,61 @@
+/* Copyright 2016-2017 INRIA and Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kremlib.h"
+#ifndef __Hacl_Chacha20_Vec128_H
+#define __Hacl_Chacha20_Vec128_H
+
+#include "vec128.h"
+
+typedef uint32_t Hacl_Impl_Xor_Lemmas_u32;
+
+typedef uint8_t Hacl_Impl_Xor_Lemmas_u8;
+
+typedef uint32_t Hacl_Impl_Chacha20_Vec128_State_u32;
+
+typedef uint32_t Hacl_Impl_Chacha20_Vec128_State_h32;
+
+typedef uint8_t *Hacl_Impl_Chacha20_Vec128_State_uint8_p;
+
+typedef vec *Hacl_Impl_Chacha20_Vec128_State_state;
+
+typedef uint32_t Hacl_Impl_Chacha20_Vec128_u32;
+
+typedef uint32_t Hacl_Impl_Chacha20_Vec128_h32;
+
+typedef uint8_t *Hacl_Impl_Chacha20_Vec128_uint8_p;
+
+typedef uint32_t Hacl_Impl_Chacha20_Vec128_idx;
+
+typedef struct
+{
+    void *k;
+    void *n;
+    uint32_t ctr;
+} Hacl_Impl_Chacha20_Vec128_log_t_;
+
+typedef void *Hacl_Impl_Chacha20_Vec128_log_t;
+
+typedef uint8_t *Hacl_Chacha20_Vec128_uint8_p;
+
+void
+Hacl_Chacha20_Vec128_chacha20(
+    uint8_t *output,
+    uint8_t *plain,
+    uint32_t len,
+    uint8_t *k,
+    uint8_t *n1,
+    uint32_t ctr);
+#endif
diff --git a/lib/freebl/verified/vec128.h b/lib/freebl/verified/vec128.h
new file mode 100644
index 000000000..986e9db82
--- /dev/null
+++ b/lib/freebl/verified/vec128.h
@@ -0,0 +1,345 @@
+/* Copyright 2016-2017 INRIA and Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __Vec_H
+#define __Vec_H
+
+#ifdef __MSVC__
+#define forceinline __forceinline inline
+#elif (defined(__GNUC__) || defined(__clang__))
+#define forceinline __attribute__((always_inline)) inline
+#else
+#define forceinline inline
+#endif
+
+#if defined(__SSSE3__) || defined(__AVX2__) || defined(__AVX__)
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#define VEC128
+#define vec_size 4
+
+typedef __m128i vec;
+
+static forceinline vec
+vec_rotate_left_8(vec v)
+{
+    __m128i x = _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+    return _mm_shuffle_epi8(v, x);
+}
+
+static forceinline vec
+vec_rotate_left_16(vec v)
+{
+    __m128i x = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+    return _mm_shuffle_epi8(v, x);
+}
+
+static forceinline vec
+vec_rotate_left(vec v, unsigned int n)
+{
+    if (n == 8)
+        return vec_rotate_left_8(v);
+    if (n == 16)
+        return vec_rotate_left_16(v);
+    return _mm_xor_si128(_mm_slli_epi32(v, n),
+                         _mm_srli_epi32(v, 32 - n));
+}
+
+static forceinline vec
+vec_rotate_right(vec v, unsigned int n)
+{
+    return (vec_rotate_left(v, 32 - n));
+}
+
+#define vec_shuffle_right(x, n) \
+    _mm_shuffle_epi32(x, _MM_SHUFFLE((3 + (n)) % 4, (2 + (n)) % 4, (1 + (n)) % 4, (n) % 4))
+
+#define vec_shuffle_left(x, n) vec_shuffle_right((x), 4 - (n))
+
+static forceinline vec
+vec_load_32x4(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4)
+{
+    return _mm_set_epi32(x4, x3, x2, x1);
+}
+
+static forceinline vec
+vec_load_32x8(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4, uint32_t x5, uint32_t x6, uint32_t x7, uint32_t x8)
+{
+    return _mm_set_epi32(x4, x3, x2, x1);
+}
+
+static forceinline vec
+vec_load_le(const unsigned char* in)
+{
+    return _mm_loadu_si128((__m128i*)(in));
+}
+
+static forceinline vec
+vec_load128_le(const unsigned char* in)
+{
+    return vec_load_le(in);
+}
+
+static forceinline void
+vec_store_le(unsigned char* out, vec v)
+{
+    _mm_storeu_si128((__m128i*)(out), v);
+}
+
+static forceinline vec
+vec_add(vec v1, vec v2)
+{
+    return _mm_add_epi32(v1, v2);
+}
+
+static forceinline vec
+vec_add_u32(vec v1, uint32_t x)
+{
+    vec v2 = vec_load_32x4(x, 0, 0, 0);
+    return _mm_add_epi32(v1, v2);
+}
+
+static forceinline vec
+vec_increment(vec v1)
+{
+    vec one = vec_load_32x4(1, 0, 0, 0);
+    return _mm_add_epi32(v1, one);
+}
+
+static forceinline vec
+vec_xor(vec v1, vec v2)
+{
+    return _mm_xor_si128(v1, v2);
+}
+
+#define vec_zero() _mm_set_epi32(0, 0, 0, 0)
+
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+
+typedef uint32x4_t vec;
+
+static forceinline vec
+vec_xor(vec v1, vec v2)
+{
+    return veorq_u32(v1, v2);
+}
+
+#define vec_rotate_left(x, n) \
+    vsriq_n_u32(vshlq_n_u32((x), (n)), (x), 32 - (n))
+
+#define vec_rotate_right(a, b) \
+    vec_rotate_left((b), 32 - (b))
+
+#define vec_shuffle_right(x, n) \
+    vextq_u32((x), (x), (n))
+
+#define vec_shuffle_left(a, b) \
+    vec_shuffle_right((a), 4 - (b))
+
+static forceinline vec
+vec_load_32x4(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4)
+{
+    uint32_t a[4] = { x1, x2, x3, x4 };
+    return vld1q_u32(a);
+}
+
+static forceinline vec
+vec_load_32(uint32_t x1)
+{
+    uint32_t a[4] = { x1, x1, x1, x1 };
+    return vld1q_u32(a);
+}
+
+static forceinline vec
+vec_load_32x8(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4, uint32_t x5, uint32_t x6, uint32_t x7, uint32_t x8)
+{
+    return vec_load_32x4(x1, x2, x3, x4);
+}
+
+static forceinline vec
+vec_load_le(const unsigned char* in)
+{
+    return vld1q_u32((uint32_t*)in);
+}
+
+static forceinline vec
+vec_load128_le(const unsigned char* in)
+{
+    return vec_load_le(in);
+}
+
+static forceinline void
+vec_store_le(unsigned char* out, vec v)
+{
+    vst1q_u32((uint32_t*)out, v);
+}
+
+static forceinline vec
+vec_add(vec v1, vec v2)
+{
+    return vaddq_u32(v1, v2);
+}
+
+static forceinline vec
+vec_add_u32(vec v1, uint32_t x)
+{
+    vec v2 = vec_load_32x4(x, 0, 0, 0);
+    return vec_add(v1, v2);
+}
+
+static forceinline vec
+vec_increment(vec v1)
+{
+    vec one = vec_load_32x4(1, 0, 0, 0);
+    return vec_add(v1, one);
+}
+
+#define vec_zero() vec_load_32x4(0, 0, 0, 0)
+
+#else
+
+#define VEC128
+#define vec_size 4
+
+typedef struct {
+    uint32_t v[4];
+} vec;
+
+static forceinline vec
+vec_xor(vec v1, vec v2)
+{
+    vec r;
+    r.v[0] = v1.v[0] ^ v2.v[0];
+    r.v[1] = v1.v[1] ^ v2.v[1];
+    r.v[2] = v1.v[2] ^ v2.v[2];
+    r.v[3] = v1.v[3] ^ v2.v[3];
+    return r;
+}
+
+static forceinline vec
+vec_rotate_left(vec v, unsigned int n)
+{
+    vec r;
+    r.v[0] = (v.v[0] << n) ^ (v.v[0] >> (32 - n));
+    r.v[1] = (v.v[1] << n) ^ (v.v[1] >> (32 - n));
+    r.v[2] = (v.v[2] << n) ^ (v.v[2] >> (32 - n));
+    r.v[3] = (v.v[3] << n) ^ (v.v[3] >> (32 - n));
+    return r;
+}
+
+static forceinline vec
+vec_rotate_right(vec v, unsigned int n)
+{
+    return (vec_rotate_left(v, 32 - n));
+}
+
+static forceinline vec
+vec_shuffle_right(vec v, unsigned int n)
+{
+    vec r;
+    r.v[0] = v.v[n % 4];
+    r.v[1] = v.v[(n + 1) % 4];
+    r.v[2] = v.v[(n + 2) % 4];
+    r.v[3] = v.v[(n + 3) % 4];
+    return r;
+}
+
+static forceinline vec
+vec_shuffle_left(vec x, unsigned int n)
+{
+    return vec_shuffle_right(x, 4 - n);
+}
+
+static forceinline vec
+vec_load_32x4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3)
+{
+    vec v;
+    v.v[0] = x0;
+    v.v[1] = x1;
+    v.v[2] = x2;
+    v.v[3] = x3;
+    return v;
+}
+
+static forceinline vec
+vec_load_32(uint32_t x0)
+{
+    vec v;
+    v.v[0] = x0;
+    v.v[1] = x0;
+    v.v[2] = x0;
+    v.v[3] = x0;
+    return v;
+}
+
+static forceinline vec
+vec_load_le(const uint8_t* in)
+{
+    vec r;
+    r.v[0] = load32_le((uint8_t*)in);
+    r.v[1] = load32_le((uint8_t*)in + 4);
+    r.v[2] = load32_le((uint8_t*)in + 8);
+    r.v[3] = load32_le((uint8_t*)in + 12);
+    return r;
+}
+
+static forceinline void
+vec_store_le(unsigned char* out, vec r)
+{
+    store32_le(out, r.v[0]);
+    store32_le(out + 4, r.v[1]);
+    store32_le(out + 8, r.v[2]);
+    store32_le(out + 12, r.v[3]);
+}
+
+static forceinline vec
+vec_load128_le(const unsigned char* in)
+{
+    return vec_load_le(in);
+}
+
+static forceinline vec
+vec_add(vec v1, vec v2)
+{
+    vec r;
+    r.v[0] = v1.v[0] + v2.v[0];
+    r.v[1] = v1.v[1] + v2.v[1];
+    r.v[2] = v1.v[2] + v2.v[2];
+    r.v[3] = v1.v[3] + v2.v[3];
+    return r;
+}
+
+static forceinline vec
+vec_add_u32(vec v1, uint32_t x)
+{
+    vec v2 = vec_load_32x4(x, 0, 0, 0);
+    return vec_add(v1, v2);
+}
+
+static forceinline vec
+vec_increment(vec v1)
+{
+    vec one = vec_load_32x4(1, 0, 0, 0);
+    return vec_add(v1, one);
+}
+
+#define vec_zero() vec_load_32x4(0, 0, 0, 0)
+
+#endif
+
+#endif
diff --git a/nss-tool/hw-support.c b/nss-tool/hw-support.c
new file mode 100644
index 000000000..0aa097ffc
--- /dev/null
+++ b/nss-tool/hw-support.c
@@ -0,0 +1,37 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+/* This is a freebl command line utility that prints hardware support as freebl
+ * sees it from its detection in blinit.c
+ */
+
+#include <stdio.h>
+
+#include "blapi.h"
+#include "blapii.h"
+#include "nss.h"
+
+int main(int argc, char const *argv[]) {
+  BL_Init();
+  printf("\n\n ========== NSS Hardware Report ==========\n");
+#if defined(NSS_X86_OR_X64)
+  printf("\tAES-NI \t%s supported\n", aesni_support() ? "" : "not");
+  printf("\tPCLMUL \t%s supported\n", clmul_support() ? "" : "not");
+  printf("\tAVX \t%s supported\n", avx_support() ? "" : "not");
+  printf("\tSSSE3 \t%s supported\n", ssse3_support() ? "" : "not");
+#elif defined(__aarch64__) || defined(__arm__)
+  printf("\tNEON \t%s supported\n", arm_neon_support() ? "" : "not");
+  printf("\tAES \t%s supported\n", arm_aes_support() ? "" : "not");
+  printf("\tPMULL \t%s supported\n", arm_pmull_support() ? "" : "not");
+  printf("\tSHA1 \t%s supported\n", arm_sha1_support() ? "" : "not");
+  printf("\tSHA2 \t%s supported\n", arm_sha2_support() ? "" : "not");
+#endif
+  printf(" ========== Hardware Report End ==========\n\n\n");
+  BL_Cleanup();
+  return 0;
+}
diff --git a/nss-tool/nss_tool.gyp b/nss-tool/nss_tool.gyp
index a5d03fcf9..d0741b426 100644
--- a/nss-tool/nss_tool.gyp
+++ b/nss-tool/nss_tool.gyp
@@ -26,6 +26,43 @@
         '<(DEPTH)/exports.gyp:dbm_exports',
         '<(DEPTH)/exports.gyp:nss_exports',
       ],
-    }
+    },
+    {
+      'target_name': 'hw-support',
+      'type': 'executable',
+      'sources': [
+        'hw-support.c',
+      ],
+      'conditions': [
+        [ 'OS=="win"', {
+          'libraries': [
+            'advapi32.lib',
+          ],
+        }],
+      ],
+      'dependencies' : [
+        '<(DEPTH)/exports.gyp:nss_exports',
+        '<(DEPTH)/lib/util/util.gyp:nssutil3',
+        '<(DEPTH)/lib/nss/nss.gyp:nss_static',
+        '<(DEPTH)/lib/pk11wrap/pk11wrap.gyp:pk11wrap_static',
+        '<(DEPTH)/lib/cryptohi/cryptohi.gyp:cryptohi',
+        '<(DEPTH)/lib/certhigh/certhigh.gyp:certhi',
+        '<(DEPTH)/lib/certdb/certdb.gyp:certdb',
+        '<(DEPTH)/lib/base/base.gyp:nssb',
+        '<(DEPTH)/lib/dev/dev.gyp:nssdev',
+        '<(DEPTH)/lib/pki/pki.gyp:nsspki',
+      ],
+      'include_dirs': [
+        '<(DEPTH)/lib/freebl',
+        '<(DEPTH)/lib/freebl/mpi',
+      ],
+      'defines': [
+        'NSS_USE_STATIC_LIBS'
+      ],
+      'variables': {
+        'module': 'nss',
+        'use_static_libs': 1
+      },
+    },
   ],
 }
diff --git a/nss.gyp b/nss.gyp
index b3524aa1a..36b0dd974 100644
--- a/nss.gyp
+++ b/nss.gyp
@@ -107,6 +107,7 @@
             'cmd/ssltap/ssltap.gyp:ssltap',
             'cmd/symkeyutil/symkeyutil.gyp:symkeyutil',
             'nss-tool/nss_tool.gyp:nss',
+            'nss-tool/nss_tool.gyp:hw-support',
           ],
         }],
       ],
diff --git a/tests/all.sh b/tests/all.sh
index 8d5bd2dbb..3a02debef 100755
--- a/tests/all.sh
+++ b/tests/all.sh
@@ -325,6 +325,11 @@ NSS_SSL_RUN="${NSS_SSL_RUN:-$nss_ssl_run}"
 ENV_BACKUP=${HOSTDIR}/env.sh
 env_backup > ${ENV_BACKUP}
 
+# Print hardware support if we built it.
+if [ -f ${BINDIR}/hw-support ]; then
+    ${BINDIR}/hw-support
+fi
+
 if [ "${O_CRON}" = "ON" ]; then
     run_cycles >> ${LOGFILE}
 else
diff --git a/tests/common/cleanup.sh b/tests/common/cleanup.sh
index 40885bc79..97c139321 100755
--- a/tests/common/cleanup.sh
+++ b/tests/common/cleanup.sh
@@ -30,6 +30,8 @@ if [ -z "${CLEANUP}" -o "${CLEANUP}" = "${SCRIPTNAME}" ]; then
     echo "NSS_DISABLE_HW_AES=${NSS_DISABLE_HW_AES}"
     echo "NSS_DISABLE_PCLMUL=${NSS_DISABLE_PCLMUL}"
     echo "NSS_DISABLE_AVX=${NSS_DISABLE_AVX}"
+    echo "NSS_DISABLE_ARM_NEON=${NSS_DISABLE_ARM_NEON}"
+    echo "NSS_DISABLE_SSSE3=${NSS_DISABLE_SSSE3}"
     echo
     echo "Tests summary:"
     echo "--------------"
author	Franziskus Kiefer <franziskuskiefer@gmail.com>	2018-02-19 12:32:59 +0100
committer	Franziskus Kiefer <franziskuskiefer@gmail.com>	2018-02-19 12:32:59 +0100
commit	89175c78faa076ba0d1be9cfe845f97c8a70e0fd (patch)
tree	1ea85e91b8f3efd7eee525177d9fea8a2284701c
parent	e2f4deee109a08371fc0fa43e318d337c3dd6f04 (diff)
download	nss-hg-89175c78faa076ba0d1be9cfe845f97c8a70e0fd.tar.gz