summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFranziskus Kiefer <franziskuskiefer@gmail.com>2018-02-26 16:09:56 +0100
committerFranziskus Kiefer <franziskuskiefer@gmail.com>2018-02-26 16:09:56 +0100
commit80059f0df685235533262a74ac9c4216e02a301d (patch)
treee32d4ee891c8e2e0377524e3bbc5cde8436e5868
parent44167178fd2022aa89f54950b8dda46831f5b86c (diff)
downloadnss-hg-80059f0df685235533262a74ac9c4216e02a301d.tar.gz
Bug 1441219 - HACL* poly1305 32-bit, r=ttaubert
Differential Revision: https://phabricator.services.mozilla.com/D649
-rw-r--r--automation/taskcluster/docker-hacl/Dockerfile2
-rwxr-xr-xautomation/taskcluster/scripts/run_hacl.sh3
-rw-r--r--lib/freebl/Makefile4
-rw-r--r--lib/freebl/chacha20poly1305.c88
-rw-r--r--lib/freebl/freebl_base.gypi4
-rw-r--r--lib/freebl/poly1305-donna-x64-sse2-incremental-source.c881
-rw-r--r--lib/freebl/poly1305.c314
-rw-r--r--lib/freebl/poly1305.h30
-rw-r--r--lib/freebl/verified/Hacl_Poly1305_32.c576
-rw-r--r--lib/freebl/verified/Hacl_Poly1305_32.h103
10 files changed, 721 insertions, 1284 deletions
diff --git a/automation/taskcluster/docker-hacl/Dockerfile b/automation/taskcluster/docker-hacl/Dockerfile
index c6e22e2c7..470efd1c0 100644
--- a/automation/taskcluster/docker-hacl/Dockerfile
+++ b/automation/taskcluster/docker-hacl/Dockerfile
@@ -9,7 +9,7 @@ ENV haclrepo https://github.com/mitls/hacl-star.git
# Define versions of dependencies
ENV opamv 4.04.2
-ENV haclversion 890ca2602c6481ef818ffa5a5a2211f1ad40fd61
+ENV haclversion 5cc724bb814ff9dce7d9e9a6cdf7bcd6800c5da8
# Install required packages and set versions
ADD setup.sh /tmp/setup.sh
diff --git a/automation/taskcluster/scripts/run_hacl.sh b/automation/taskcluster/scripts/run_hacl.sh
index 281075eef..e038197eb 100755
--- a/automation/taskcluster/scripts/run_hacl.sh
+++ b/automation/taskcluster/scripts/run_hacl.sh
@@ -12,9 +12,6 @@ set -e -x -v
# The extracted C code from HACL* is already generated and the HACL* tests were
# successfully executed.
-# Verify Poly1305 (doesn't work in docker image build)
-make verify -C ~/hacl-star/code/poly1305 -j$(nproc)
-
# Add license header to specs
spec_files=($(find ~/hacl-star/specs -type f -name '*.fst'))
for f in "${spec_files[@]}"; do
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index a4b1a86ae..e6a94d35c 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -517,13 +517,13 @@ ifndef NSS_DISABLE_CHACHAPOLY
ifdef HAVE_INT128_SUPPORT
EXTRA_SRCS += Hacl_Poly1305_64.c
else
- EXTRA_SRCS += poly1305.c
+ EXTRA_SRCS += Hacl_Poly1305_32.c
endif
else
ifeq ($(CPU_ARCH),aarch64)
EXTRA_SRCS += Hacl_Poly1305_64.c
else
- EXTRA_SRCS += poly1305.c
+ EXTRA_SRCS += Hacl_Poly1305_32.c
endif
endif # x86_64
diff --git a/lib/freebl/chacha20poly1305.c b/lib/freebl/chacha20poly1305.c
index 859d05316..302f0db9e 100644
--- a/lib/freebl/chacha20poly1305.c
+++ b/lib/freebl/chacha20poly1305.c
@@ -24,36 +24,60 @@ extern void Hacl_Chacha20_Vec128_chacha20(uint8_t *output, uint8_t *plain,
extern void Hacl_Chacha20_chacha20(uint8_t *output, uint8_t *plain, uint32_t len,
uint8_t *k, uint8_t *n1, uint32_t ctr);
-/* Poly1305Do writes the Poly1305 authenticator of the given additional data
- * and ciphertext to |out|. */
#if defined(HAVE_INT128_SUPPORT) && (defined(NSS_X86_OR_X64) || defined(__aarch64__))
/* Use HACL* Poly1305 on 64-bit Intel and ARM */
#include "verified/Hacl_Poly1305_64.h"
+#define NSS_POLY1305_64 1
+#define Hacl_Poly1305_update Hacl_Poly1305_64_update
+#define Hacl_Poly1305_mk_state Hacl_Poly1305_64_mk_state
+#define Hacl_Poly1305_init Hacl_Poly1305_64_init
+#define Hacl_Poly1305_finish Hacl_Poly1305_64_finish
+typedef Hacl_Impl_Poly1305_64_State_poly1305_state Hacl_Impl_Poly1305_State_poly1305_state;
+#else
+/* All other platforms get the 32-bit poly1305 HACL* implementation. */
+#include "verified/Hacl_Poly1305_32.h"
+#define NSS_POLY1305_32 1
+#define Hacl_Poly1305_update Hacl_Poly1305_32_update
+#define Hacl_Poly1305_mk_state Hacl_Poly1305_32_mk_state
+#define Hacl_Poly1305_init Hacl_Poly1305_32_init
+#define Hacl_Poly1305_finish Hacl_Poly1305_32_finish
+typedef Hacl_Impl_Poly1305_32_State_poly1305_state Hacl_Impl_Poly1305_State_poly1305_state;
+#endif /* HAVE_INT128_SUPPORT */
static void
-Poly1305PadUpdate(Hacl_Impl_Poly1305_64_State_poly1305_state state,
+Poly1305PadUpdate(Hacl_Impl_Poly1305_State_poly1305_state state,
unsigned char *block, const unsigned char *p,
const unsigned int pLen)
{
unsigned int pRemLen = pLen % 16;
- Hacl_Poly1305_64_update(state, (uint8_t *)p, (pLen / 16));
+ Hacl_Poly1305_update(state, (uint8_t *)p, (pLen / 16));
if (pRemLen > 0) {
memcpy(block, p + (pLen - pRemLen), pRemLen);
- Hacl_Poly1305_64_update(state, block, 1);
+ Hacl_Poly1305_update(state, block, 1);
}
}
+/* Poly1305Do writes the Poly1305 authenticator of the given additional data
+ * and ciphertext to |out|. */
static void
Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
const unsigned char *ciphertext, unsigned int ciphertextLen,
const unsigned char key[32])
{
- uint64_t tmp1[6U] = { 0U };
- Hacl_Impl_Poly1305_64_State_poly1305_state state =
- Hacl_Poly1305_64_mk_state(tmp1, tmp1 + 3);
+#ifdef NSS_POLY1305_64
+ uint64_t stateStack[6U] = { 0U };
+ size_t offset = 3;
+#elif defined NSS_POLY1305_32
+ uint32_t stateStack[10U] = { 0U };
+ size_t offset = 5;
+#else
+#error "This can't happen."
+#endif
+ Hacl_Impl_Poly1305_State_poly1305_state state =
+ Hacl_Poly1305_mk_state(stateStack, stateStack + offset);
unsigned char block[16] = { 0 };
- Hacl_Poly1305_64_init(state, (uint8_t *)key);
+ Hacl_Poly1305_init(state, (uint8_t *)key);
Poly1305PadUpdate(state, block, ad, adLen);
memset(block, 0, 16);
@@ -68,49 +92,11 @@ Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
block[i] = j;
}
- Hacl_Poly1305_64_update(state, block, 1);
- Hacl_Poly1305_64_finish(state, out, (uint8_t *)(key + 16));
+ Hacl_Poly1305_update(state, block, 1);
+ Hacl_Poly1305_finish(state, out, (uint8_t *)(key + 16));
+#undef NSS_POLY1305_64
+#undef NSS_POLY1305_32
}
-#else
-/* All other platforms get the 32-bit poly1305 reference implementation. */
-#include "poly1305.h"
-
-static void
-Poly1305Do(unsigned char *out, const unsigned char *ad, unsigned int adLen,
- const unsigned char *ciphertext, unsigned int ciphertextLen,
- const unsigned char key[32])
-{
- poly1305_state state;
- unsigned int j;
- unsigned char lengthBytes[8];
- static const unsigned char zeros[15];
- unsigned int i;
-
- Poly1305Init(&state, key);
- Poly1305Update(&state, ad, adLen);
- if (adLen % 16 > 0) {
- Poly1305Update(&state, zeros, 16 - adLen % 16);
- }
- Poly1305Update(&state, ciphertext, ciphertextLen);
- if (ciphertextLen % 16 > 0) {
- Poly1305Update(&state, zeros, 16 - ciphertextLen % 16);
- }
- j = adLen;
- for (i = 0; i < sizeof(lengthBytes); i++) {
- lengthBytes[i] = j;
- j >>= 8;
- }
- Poly1305Update(&state, lengthBytes, sizeof(lengthBytes));
- j = ciphertextLen;
- for (i = 0; i < sizeof(lengthBytes); i++) {
- lengthBytes[i] = j;
- j >>= 8;
- }
- Poly1305Update(&state, lengthBytes, sizeof(lengthBytes));
- Poly1305Finish(&state, out);
-}
-
-#endif /* HAVE_INT128_SUPPORT */
#endif /* NSS_DISABLE_CHACHAPOLY */
SECStatus
diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi
index ebd1018d8..22baf4e5c 100644
--- a/lib/freebl/freebl_base.gypi
+++ b/lib/freebl/freebl_base.gypi
@@ -167,7 +167,7 @@
}, {
# !Windows & !x64 & !arm64 & !aarch64
'sources': [
- 'poly1305.c',
+ 'verified/Hacl_Poly1305_32.c',
],
}],
],
@@ -176,7 +176,7 @@
}, {
# Windows
'sources': [
- 'poly1305.c',
+ 'verified/Hacl_Poly1305_32.c',
],
}],
],
diff --git a/lib/freebl/poly1305-donna-x64-sse2-incremental-source.c b/lib/freebl/poly1305-donna-x64-sse2-incremental-source.c
deleted file mode 100644
index 3c803c167..000000000
--- a/lib/freebl/poly1305-donna-x64-sse2-incremental-source.c
+++ /dev/null
@@ -1,881 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-/* This implementation of poly1305 is by Andrew Moon
- * (https://github.com/floodyberry/poly1305-donna) and released as public
- * domain. It implements SIMD vectorization based on the algorithm described in
- * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
- * block size. */
-
-#include <emmintrin.h>
-#include <stdint.h>
-
-#include "poly1305.h"
-#include "blapii.h"
-
-#define ALIGN(x) __attribute__((aligned(x)))
-#define INLINE inline
-#define U8TO64_LE(m) (*(uint64_t *)(m))
-#define U8TO32_LE(m) (*(uint32_t *)(m))
-#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v
-
-typedef __m128i xmmi;
-typedef unsigned __int128 uint128_t;
-
-static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = { (1 << 26) - 1, 0, (1 << 26) - 1, 0 };
-static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = { 5, 0, 5, 0 };
-static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = { (1 << 24), 0, (1 << 24), 0 };
-
-static uint128_t INLINE
-add128(uint128_t a, uint128_t b)
-{
- return a + b;
-}
-
-static uint128_t INLINE
-add128_64(uint128_t a, uint64_t b)
-{
- return a + b;
-}
-
-static uint128_t INLINE
-mul64x64_128(uint64_t a, uint64_t b)
-{
- return (uint128_t)a * b;
-}
-
-static uint64_t INLINE
-lo128(uint128_t a)
-{
- return (uint64_t)a;
-}
-
-static uint64_t INLINE
-shr128(uint128_t v, const int shift)
-{
- return (uint64_t)(v >> shift);
-}
-
-static uint64_t INLINE
-shr128_pair(uint64_t hi, uint64_t lo, const int shift)
-{
- return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
-}
-
-typedef struct poly1305_power_t {
- union {
- xmmi v;
- uint64_t u[2];
- uint32_t d[4];
- } R20, R21, R22, R23, R24, S21, S22, S23, S24;
-} poly1305_power;
-
-typedef struct poly1305_state_internal_t {
- poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 bytes of free storage */
- union {
- xmmi H[5]; /* 80 bytes */
- uint64_t HH[10];
- };
- /* uint64_t r0,r1,r2; [24 bytes] */
- /* uint64_t pad0,pad1; [16 bytes] */
- uint64_t started; /* 8 bytes */
- uint64_t leftover; /* 8 bytes */
- uint8_t buffer[64]; /* 64 bytes */
-} poly1305_state_internal; /* 448 bytes total + 63 bytes for alignment = 511 bytes raw */
-
-static poly1305_state_internal INLINE
- *
- poly1305_aligned_state(poly1305_state *state)
-{
- return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
-}
-
-/* copy 0-63 bytes */
-static void INLINE NO_SANITIZE_ALIGNMENT
-poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes)
-{
- size_t offset = src - dst;
- if (bytes & 32) {
- _mm_storeu_si128((xmmi *)(dst + 0), _mm_loadu_si128((xmmi *)(dst + offset + 0)));
- _mm_storeu_si128((xmmi *)(dst + 16), _mm_loadu_si128((xmmi *)(dst + offset + 16)));
- dst += 32;
- }
- if (bytes & 16) {
- _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset)));
- dst += 16;
- }
- if (bytes & 8) {
- *(uint64_t *)dst = *(uint64_t *)(dst + offset);
- dst += 8;
- }
- if (bytes & 4) {
- *(uint32_t *)dst = *(uint32_t *)(dst + offset);
- dst += 4;
- }
- if (bytes & 2) {
- *(uint16_t *)dst = *(uint16_t *)(dst + offset);
- dst += 2;
- }
- if (bytes & 1) {
- *(uint8_t *)dst = *(uint8_t *)(dst + offset);
- }
-}
-
-/* zero 0-15 bytes */
-static void INLINE
-poly1305_block_zero(uint8_t *dst, size_t bytes)
-{
- if (bytes & 8) {
- *(uint64_t *)dst = 0;
- dst += 8;
- }
- if (bytes & 4) {
- *(uint32_t *)dst = 0;
- dst += 4;
- }
- if (bytes & 2) {
- *(uint16_t *)dst = 0;
- dst += 2;
- }
- if (bytes & 1) {
- *(uint8_t *)dst = 0;
- }
-}
-
-static size_t INLINE
-poly1305_min(size_t a, size_t b)
-{
- return (a < b) ? a : b;
-}
-
-void
-Poly1305Init(poly1305_state *state, const unsigned char key[32])
-{
- poly1305_state_internal *st = poly1305_aligned_state(state);
- poly1305_power *p;
- uint64_t r0, r1, r2;
- uint64_t t0, t1;
-
- /* clamp key */
- t0 = U8TO64_LE(key + 0);
- t1 = U8TO64_LE(key + 8);
- r0 = t0 & 0xffc0fffffff;
- t0 >>= 44;
- t0 |= t1 << 20;
- r1 = t0 & 0xfffffc0ffff;
- t1 >>= 24;
- r2 = t1 & 0x00ffffffc0f;
-
- /* store r in un-used space of st->P[1] */
- p = &st->P[1];
- p->R20.d[1] = (uint32_t)(r0);
- p->R20.d[3] = (uint32_t)(r0 >> 32);
- p->R21.d[1] = (uint32_t)(r1);
- p->R21.d[3] = (uint32_t)(r1 >> 32);
- p->R22.d[1] = (uint32_t)(r2);
- p->R22.d[3] = (uint32_t)(r2 >> 32);
-
- /* store pad */
- p->R23.d[1] = U8TO32_LE(key + 16);
- p->R23.d[3] = U8TO32_LE(key + 20);
- p->R24.d[1] = U8TO32_LE(key + 24);
- p->R24.d[3] = U8TO32_LE(key + 28);
-
- /* H = 0 */
- st->H[0] = _mm_setzero_si128();
- st->H[1] = _mm_setzero_si128();
- st->H[2] = _mm_setzero_si128();
- st->H[3] = _mm_setzero_si128();
- st->H[4] = _mm_setzero_si128();
-
- st->started = 0;
- st->leftover = 0;
-}
-
-static void
-poly1305_first_block(poly1305_state_internal *st, const uint8_t *m)
-{
- const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
- const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
- const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
- xmmi T5, T6;
- poly1305_power *p;
- uint128_t d[3];
- uint64_t r0, r1, r2;
- uint64_t r20, r21, r22, s22;
- uint64_t pad0, pad1;
- uint64_t c;
- uint64_t i;
-
- /* pull out stored info */
- p = &st->P[1];
-
- r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
- r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
- r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
- pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
- pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
-
- /* compute powers r^2,r^4 */
- r20 = r0;
- r21 = r1;
- r22 = r2;
- for (i = 0; i < 2; i++) {
- s22 = r22 * (5 << 2);
-
- d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
- d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
- d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));
-
- r20 = lo128(d[0]) & 0xfffffffffff;
- c = shr128(d[0], 44);
- d[1] = add128_64(d[1], c);
- r21 = lo128(d[1]) & 0xfffffffffff;
- c = shr128(d[1], 44);
- d[2] = add128_64(d[2], c);
- r22 = lo128(d[2]) & 0x3ffffffffff;
- c = shr128(d[2], 42);
- r20 += c * 5;
- c = (r20 >> 44);
- r20 = r20 & 0xfffffffffff;
- r21 += c;
-
- p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), _MM_SHUFFLE(1, 0, 1, 0));
- p->R21.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), _MM_SHUFFLE(1, 0, 1, 0));
- p->R22.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), _MM_SHUFFLE(1, 0, 1, 0));
- p->R23.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), _MM_SHUFFLE(1, 0, 1, 0));
- p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), _MM_SHUFFLE(1, 0, 1, 0));
- p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
- p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
- p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
- p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
- p--;
- }
-
- /* put saved info back */
- p = &st->P[1];
- p->R20.d[1] = (uint32_t)(r0);
- p->R20.d[3] = (uint32_t)(r0 >> 32);
- p->R21.d[1] = (uint32_t)(r1);
- p->R21.d[3] = (uint32_t)(r1 >> 32);
- p->R22.d[1] = (uint32_t)(r2);
- p->R22.d[3] = (uint32_t)(r2 >> 32);
- p->R23.d[1] = (uint32_t)(pad0);
- p->R23.d[3] = (uint32_t)(pad0 >> 32);
- p->R24.d[1] = (uint32_t)(pad1);
- p->R24.d[3] = (uint32_t)(pad1 >> 32);
-
- /* H = [Mx,My] */
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
- st->H[0] = _mm_and_si128(MMASK, T5);
- st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- st->H[2] = _mm_and_si128(MMASK, T5);
- st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
-}
-
-static void
-poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, size_t bytes)
-{
- const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
- const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
- const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
-
- poly1305_power *p;
- xmmi H0, H1, H2, H3, H4;
- xmmi T0, T1, T2, T3, T4, T5, T6;
- xmmi M0, M1, M2, M3, M4;
- xmmi C1, C2;
-
- H0 = st->H[0];
- H1 = st->H[1];
- H2 = st->H[2];
- H3 = st->H[3];
- H4 = st->H[4];
-
- while (bytes >= 64) {
- /* H *= [r^4,r^4] */
- p = &st->P[0];
- T0 = _mm_mul_epu32(H0, p->R20.v);
- T1 = _mm_mul_epu32(H0, p->R21.v);
- T2 = _mm_mul_epu32(H0, p->R22.v);
- T3 = _mm_mul_epu32(H0, p->R23.v);
- T4 = _mm_mul_epu32(H0, p->R24.v);
- T5 = _mm_mul_epu32(H1, p->S24.v);
- T6 = _mm_mul_epu32(H1, p->R20.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H2, p->S23.v);
- T6 = _mm_mul_epu32(H2, p->S24.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H3, p->S22.v);
- T6 = _mm_mul_epu32(H3, p->S23.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H4, p->S21.v);
- T6 = _mm_mul_epu32(H4, p->S22.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H1, p->R21.v);
- T6 = _mm_mul_epu32(H1, p->R22.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H2, p->R20.v);
- T6 = _mm_mul_epu32(H2, p->R21.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H3, p->S24.v);
- T6 = _mm_mul_epu32(H3, p->R20.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H4, p->S23.v);
- T6 = _mm_mul_epu32(H4, p->S24.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H1, p->R23.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H2, p->R22.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H3, p->R21.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H4, p->R20.v);
- T4 = _mm_add_epi64(T4, T5);
-
- /* H += [Mx,My]*[r^2,r^2] */
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
- M0 = _mm_and_si128(MMASK, T5);
- M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- M2 = _mm_and_si128(MMASK, T5);
- M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
-
- p = &st->P[1];
- T5 = _mm_mul_epu32(M0, p->R20.v);
- T6 = _mm_mul_epu32(M0, p->R21.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(M1, p->S24.v);
- T6 = _mm_mul_epu32(M1, p->R20.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(M2, p->S23.v);
- T6 = _mm_mul_epu32(M2, p->S24.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(M3, p->S22.v);
- T6 = _mm_mul_epu32(M3, p->S23.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(M4, p->S21.v);
- T6 = _mm_mul_epu32(M4, p->S22.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(M0, p->R22.v);
- T6 = _mm_mul_epu32(M0, p->R23.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(M1, p->R21.v);
- T6 = _mm_mul_epu32(M1, p->R22.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(M2, p->R20.v);
- T6 = _mm_mul_epu32(M2, p->R21.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(M3, p->S24.v);
- T6 = _mm_mul_epu32(M3, p->R20.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(M4, p->S23.v);
- T6 = _mm_mul_epu32(M4, p->S24.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(M0, p->R24.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(M1, p->R23.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(M2, p->R22.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(M3, p->R21.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(M4, p->R20.v);
- T4 = _mm_add_epi64(T4, T5);
-
- /* H += [Mx,My] */
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)), _mm_loadl_epi64((xmmi *)(m + 48)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)), _mm_loadl_epi64((xmmi *)(m + 56)));
- M0 = _mm_and_si128(MMASK, T5);
- M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- M2 = _mm_and_si128(MMASK, T5);
- M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
-
- T0 = _mm_add_epi64(T0, M0);
- T1 = _mm_add_epi64(T1, M1);
- T2 = _mm_add_epi64(T2, M2);
- T3 = _mm_add_epi64(T3, M3);
- T4 = _mm_add_epi64(T4, M4);
-
- /* reduce */
- C1 = _mm_srli_epi64(T0, 26);
- C2 = _mm_srli_epi64(T3, 26);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_and_si128(T3, MMASK);
- T1 = _mm_add_epi64(T1, C1);
- T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26);
- C2 = _mm_srli_epi64(T4, 26);
- T1 = _mm_and_si128(T1, MMASK);
- T4 = _mm_and_si128(T4, MMASK);
- T2 = _mm_add_epi64(T2, C1);
- T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26);
- C2 = _mm_srli_epi64(T0, 26);
- T2 = _mm_and_si128(T2, MMASK);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_add_epi64(T3, C1);
- T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26);
- T3 = _mm_and_si128(T3, MMASK);
- T4 = _mm_add_epi64(T4, C1);
-
- /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
- H0 = T0;
- H1 = T1;
- H2 = T2;
- H3 = T3;
- H4 = T4;
-
- m += 64;
- bytes -= 64;
- }
-
- st->H[0] = H0;
- st->H[1] = H1;
- st->H[2] = H2;
- st->H[3] = H3;
- st->H[4] = H4;
-}
-
-static size_t
-poly1305_combine(poly1305_state_internal *st, const uint8_t *m, size_t bytes)
-{
- const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
- const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
- const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
-
- poly1305_power *p;
- xmmi H0, H1, H2, H3, H4;
- xmmi M0, M1, M2, M3, M4;
- xmmi T0, T1, T2, T3, T4, T5, T6;
- xmmi C1, C2;
-
- uint64_t r0, r1, r2;
- uint64_t t0, t1, t2, t3, t4;
- uint64_t c;
- size_t consumed = 0;
-
- H0 = st->H[0];
- H1 = st->H[1];
- H2 = st->H[2];
- H3 = st->H[3];
- H4 = st->H[4];
-
- /* p = [r^2,r^2] */
- p = &st->P[1];
-
- if (bytes >= 32) {
- /* H *= [r^2,r^2] */
- T0 = _mm_mul_epu32(H0, p->R20.v);
- T1 = _mm_mul_epu32(H0, p->R21.v);
- T2 = _mm_mul_epu32(H0, p->R22.v);
- T3 = _mm_mul_epu32(H0, p->R23.v);
- T4 = _mm_mul_epu32(H0, p->R24.v);
- T5 = _mm_mul_epu32(H1, p->S24.v);
- T6 = _mm_mul_epu32(H1, p->R20.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H2, p->S23.v);
- T6 = _mm_mul_epu32(H2, p->S24.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H3, p->S22.v);
- T6 = _mm_mul_epu32(H3, p->S23.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H4, p->S21.v);
- T6 = _mm_mul_epu32(H4, p->S22.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H1, p->R21.v);
- T6 = _mm_mul_epu32(H1, p->R22.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H2, p->R20.v);
- T6 = _mm_mul_epu32(H2, p->R21.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H3, p->S24.v);
- T6 = _mm_mul_epu32(H3, p->R20.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H4, p->S23.v);
- T6 = _mm_mul_epu32(H4, p->S24.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H1, p->R23.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H2, p->R22.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H3, p->R21.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H4, p->R20.v);
- T4 = _mm_add_epi64(T4, T5);
-
- /* H += [Mx,My] */
- T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)), _mm_loadl_epi64((xmmi *)(m + 16)));
- T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)), _mm_loadl_epi64((xmmi *)(m + 24)));
- M0 = _mm_and_si128(MMASK, T5);
- M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- M2 = _mm_and_si128(MMASK, T5);
- M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
-
- T0 = _mm_add_epi64(T0, M0);
- T1 = _mm_add_epi64(T1, M1);
- T2 = _mm_add_epi64(T2, M2);
- T3 = _mm_add_epi64(T3, M3);
- T4 = _mm_add_epi64(T4, M4);
-
- /* reduce */
- C1 = _mm_srli_epi64(T0, 26);
- C2 = _mm_srli_epi64(T3, 26);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_and_si128(T3, MMASK);
- T1 = _mm_add_epi64(T1, C1);
- T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26);
- C2 = _mm_srli_epi64(T4, 26);
- T1 = _mm_and_si128(T1, MMASK);
- T4 = _mm_and_si128(T4, MMASK);
- T2 = _mm_add_epi64(T2, C1);
- T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26);
- C2 = _mm_srli_epi64(T0, 26);
- T2 = _mm_and_si128(T2, MMASK);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_add_epi64(T3, C1);
- T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26);
- T3 = _mm_and_si128(T3, MMASK);
- T4 = _mm_add_epi64(T4, C1);
-
- /* H = (H*[r^2,r^2] + [Mx,My]) */
- H0 = T0;
- H1 = T1;
- H2 = T2;
- H3 = T3;
- H4 = T4;
-
- consumed = 32;
- }
-
- /* finalize, H *= [r^2,r] */
- r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
- r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
- r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
-
- p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
- p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
- p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
- p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
- p->R24.d[2] = (uint32_t)((r2 >> 16));
- p->S21.d[2] = p->R21.d[2] * 5;
- p->S22.d[2] = p->R22.d[2] * 5;
- p->S23.d[2] = p->R23.d[2] * 5;
- p->S24.d[2] = p->R24.d[2] * 5;
-
- /* H *= [r^2,r] */
- T0 = _mm_mul_epu32(H0, p->R20.v);
- T1 = _mm_mul_epu32(H0, p->R21.v);
- T2 = _mm_mul_epu32(H0, p->R22.v);
- T3 = _mm_mul_epu32(H0, p->R23.v);
- T4 = _mm_mul_epu32(H0, p->R24.v);
- T5 = _mm_mul_epu32(H1, p->S24.v);
- T6 = _mm_mul_epu32(H1, p->R20.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H2, p->S23.v);
- T6 = _mm_mul_epu32(H2, p->S24.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H3, p->S22.v);
- T6 = _mm_mul_epu32(H3, p->S23.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H4, p->S21.v);
- T6 = _mm_mul_epu32(H4, p->S22.v);
- T0 = _mm_add_epi64(T0, T5);
- T1 = _mm_add_epi64(T1, T6);
- T5 = _mm_mul_epu32(H1, p->R21.v);
- T6 = _mm_mul_epu32(H1, p->R22.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H2, p->R20.v);
- T6 = _mm_mul_epu32(H2, p->R21.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H3, p->S24.v);
- T6 = _mm_mul_epu32(H3, p->R20.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H4, p->S23.v);
- T6 = _mm_mul_epu32(H4, p->S24.v);
- T2 = _mm_add_epi64(T2, T5);
- T3 = _mm_add_epi64(T3, T6);
- T5 = _mm_mul_epu32(H1, p->R23.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H2, p->R22.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H3, p->R21.v);
- T4 = _mm_add_epi64(T4, T5);
- T5 = _mm_mul_epu32(H4, p->R20.v);
- T4 = _mm_add_epi64(T4, T5);
-
- C1 = _mm_srli_epi64(T0, 26);
- C2 = _mm_srli_epi64(T3, 26);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_and_si128(T3, MMASK);
- T1 = _mm_add_epi64(T1, C1);
- T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26);
- C2 = _mm_srli_epi64(T4, 26);
- T1 = _mm_and_si128(T1, MMASK);
- T4 = _mm_and_si128(T4, MMASK);
- T2 = _mm_add_epi64(T2, C1);
- T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26);
- C2 = _mm_srli_epi64(T0, 26);
- T2 = _mm_and_si128(T2, MMASK);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_add_epi64(T3, C1);
- T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26);
- T3 = _mm_and_si128(T3, MMASK);
- T4 = _mm_add_epi64(T4, C1);
-
- /* H = H[0]+H[1] */
- H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
- H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
- H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
- H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
- H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
-
- t0 = _mm_cvtsi128_si32(H0);
- c = (t0 >> 26);
- t0 &= 0x3ffffff;
- t1 = _mm_cvtsi128_si32(H1) + c;
- c = (t1 >> 26);
- t1 &= 0x3ffffff;
- t2 = _mm_cvtsi128_si32(H2) + c;
- c = (t2 >> 26);
- t2 &= 0x3ffffff;
- t3 = _mm_cvtsi128_si32(H3) + c;
- c = (t3 >> 26);
- t3 &= 0x3ffffff;
- t4 = _mm_cvtsi128_si32(H4) + c;
- c = (t4 >> 26);
- t4 &= 0x3ffffff;
- t0 = t0 + (c * 5);
- c = (t0 >> 26);
- t0 &= 0x3ffffff;
- t1 = t1 + c;
-
- st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull;
- st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull;
- st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull;
-
- return consumed;
-}
-
-void
-Poly1305Update(poly1305_state *state, const unsigned char *m, size_t bytes)
-{
- poly1305_state_internal *st = poly1305_aligned_state(state);
- size_t want;
-
- /* need at least 32 initial bytes to start the accelerated branch */
- if (!st->started) {
- if ((st->leftover == 0) && (bytes > 32)) {
- poly1305_first_block(st, m);
- m += 32;
- bytes -= 32;
- } else {
- want = poly1305_min(32 - st->leftover, bytes);
- poly1305_block_copy(st->buffer + st->leftover, m, want);
- bytes -= want;
- m += want;
- st->leftover += want;
- if ((st->leftover < 32) || (bytes == 0))
- return;
- poly1305_first_block(st, st->buffer);
- st->leftover = 0;
- }
- st->started = 1;
- }
-
- /* handle leftover */
- if (st->leftover) {
- want = poly1305_min(64 - st->leftover, bytes);
- poly1305_block_copy(st->buffer + st->leftover, m, want);
- bytes -= want;
- m += want;
- st->leftover += want;
- if (st->leftover < 64)
- return;
- poly1305_blocks(st, st->buffer, 64);
- st->leftover = 0;
- }
-
- /* process 64 byte blocks */
- if (bytes >= 64) {
- want = (bytes & ~63);
- poly1305_blocks(st, m, want);
- m += want;
- bytes -= want;
- }
-
- if (bytes) {
- poly1305_block_copy(st->buffer + st->leftover, m, bytes);
- st->leftover += bytes;
- }
-}
-
-void
-Poly1305Finish(poly1305_state *state, unsigned char mac[16])
-{
- poly1305_state_internal *st = poly1305_aligned_state(state);
- size_t leftover = st->leftover;
- uint8_t *m = st->buffer;
- uint128_t d[3];
- uint64_t h0, h1, h2;
- uint64_t t0, t1;
- uint64_t g0, g1, g2, c, nc;
- uint64_t r0, r1, r2, s1, s2;
- poly1305_power *p;
-
- if (st->started) {
- size_t consumed = poly1305_combine(st, m, leftover);
- leftover -= consumed;
- m += consumed;
- }
-
- /* st->HH will either be 0 or have the combined result */
- h0 = st->HH[0];
- h1 = st->HH[1];
- h2 = st->HH[2];
-
- p = &st->P[1];
- r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
- r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
- r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
- s1 = r1 * (5 << 2);
- s2 = r2 * (5 << 2);
-
- if (leftover < 16)
- goto poly1305_donna_atmost15bytes;
-
-poly1305_donna_atleast16bytes:
- t0 = U8TO64_LE(m + 0);
- t1 = U8TO64_LE(m + 8);
- h0 += t0 & 0xfffffffffff;
- t0 = shr128_pair(t1, t0, 44);
- h1 += t0 & 0xfffffffffff;
- h2 += (t1 >> 24) | ((uint64_t)1 << 40);
-
-poly1305_donna_mul:
- d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), mul64x64_128(h2, s1));
- d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), mul64x64_128(h2, s2));
- d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), mul64x64_128(h2, r0));
- h0 = lo128(d[0]) & 0xfffffffffff;
- c = shr128(d[0], 44);
- d[1] = add128_64(d[1], c);
- h1 = lo128(d[1]) & 0xfffffffffff;
- c = shr128(d[1], 44);
- d[2] = add128_64(d[2], c);
- h2 = lo128(d[2]) & 0x3ffffffffff;
- c = shr128(d[2], 42);
- h0 += c * 5;
-
- m += 16;
- leftover -= 16;
- if (leftover >= 16)
- goto poly1305_donna_atleast16bytes;
-
-/* final bytes */
-poly1305_donna_atmost15bytes:
- if (!leftover)
- goto poly1305_donna_finish;
-
- m[leftover++] = 1;
- poly1305_block_zero(m + leftover, 16 - leftover);
- leftover = 16;
-
- t0 = U8TO64_LE(m + 0);
- t1 = U8TO64_LE(m + 8);
- h0 += t0 & 0xfffffffffff;
- t0 = shr128_pair(t1, t0, 44);
- h1 += t0 & 0xfffffffffff;
- h2 += (t1 >> 24);
-
- goto poly1305_donna_mul;
-
-poly1305_donna_finish:
- c = (h0 >> 44);
- h0 &= 0xfffffffffff;
- h1 += c;
- c = (h1 >> 44);
- h1 &= 0xfffffffffff;
- h2 += c;
- c = (h2 >> 42);
- h2 &= 0x3ffffffffff;
- h0 += c * 5;
-
- g0 = h0 + 5;
- c = (g0 >> 44);
- g0 &= 0xfffffffffff;
- g1 = h1 + c;
- c = (g1 >> 44);
- g1 &= 0xfffffffffff;
- g2 = h2 + c - ((uint64_t)1 << 42);
-
- c = (g2 >> 63) - 1;
- nc = ~c;
- h0 = (h0 & nc) | (g0 & c);
- h1 = (h1 & nc) | (g1 & c);
- h2 = (h2 & nc) | (g2 & c);
-
- /* pad */
- t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
- t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
- h0 += (t0 & 0xfffffffffff);
- c = (h0 >> 44);
- h0 &= 0xfffffffffff;
- t0 = shr128_pair(t1, t0, 44);
- h1 += (t0 & 0xfffffffffff) + c;
- c = (h1 >> 44);
- h1 &= 0xfffffffffff;
- t1 = (t1 >> 24);
- h2 += (t1) + c;
-
- U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
- U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
-}
diff --git a/lib/freebl/poly1305.c b/lib/freebl/poly1305.c
deleted file mode 100644
index eb3e3cd55..000000000
--- a/lib/freebl/poly1305.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-/* This implementation of poly1305 is by Andrew Moon
- * (https://github.com/floodyberry/poly1305-donna) and released as public
- * domain. */
-
-#include <string.h>
-
-#include "poly1305.h"
-
-#if defined(_MSC_VER) && _MSC_VER < 1600
-#include "prtypes.h"
-typedef PRUint32 uint32_t;
-typedef PRUint64 uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-#if defined(NSS_X86) || defined(NSS_X64)
-/* We can assume little-endian. */
-static uint32_t
-U8TO32_LE(const unsigned char *m)
-{
- uint32_t r;
- memcpy(&r, m, sizeof(r));
- return r;
-}
-
-static void
-U32TO8_LE(unsigned char *m, uint32_t v)
-{
- memcpy(m, &v, sizeof(v));
-}
-#else
-static uint32_t
-U8TO32_LE(const unsigned char *m)
-{
- return (uint32_t)m[0] |
- (uint32_t)m[1] << 8 |
- (uint32_t)m[2] << 16 |
- (uint32_t)m[3] << 24;
-}
-
-static void
-U32TO8_LE(unsigned char *m, uint32_t v)
-{
- m[0] = v;
- m[1] = v >> 8;
- m[2] = v >> 16;
- m[3] = v >> 24;
-}
-#endif
-
-static uint64_t
-mul32x32_64(uint32_t a, uint32_t b)
-{
- return (uint64_t)a * b;
-}
-
-struct poly1305_state_st {
- uint32_t r0, r1, r2, r3, r4;
- uint32_t s1, s2, s3, s4;
- uint32_t h0, h1, h2, h3, h4;
- unsigned char buf[16];
- unsigned int buf_used;
- unsigned char key[16];
-};
-
-/* update updates |state| given some amount of input data. This function may
- * only be called with a |len| that is not a multiple of 16 at the end of the
- * data. Otherwise the input must be buffered into 16 byte blocks. */
-static void
-update(struct poly1305_state_st *state, const unsigned char *in,
- size_t len)
-{
- uint32_t t0, t1, t2, t3;
- uint64_t t[5];
- uint32_t b;
- uint64_t c;
- size_t j;
- unsigned char mp[16];
-
- if (len < 16)
- goto poly1305_donna_atmost15bytes;
-
-poly1305_donna_16bytes:
- t0 = U8TO32_LE(in);
- t1 = U8TO32_LE(in + 4);
- t2 = U8TO32_LE(in + 8);
- t3 = U8TO32_LE(in + 12);
-
- in += 16;
- len -= 16;
-
- state->h0 += t0 & 0x3ffffff;
- state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
- state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
- state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
- state->h4 += (t3 >> 8) | (1 << 24);
-
-poly1305_donna_mul:
- t[0] = mul32x32_64(state->h0, state->r0) +
- mul32x32_64(state->h1, state->s4) +
- mul32x32_64(state->h2, state->s3) +
- mul32x32_64(state->h3, state->s2) +
- mul32x32_64(state->h4, state->s1);
- t[1] = mul32x32_64(state->h0, state->r1) +
- mul32x32_64(state->h1, state->r0) +
- mul32x32_64(state->h2, state->s4) +
- mul32x32_64(state->h3, state->s3) +
- mul32x32_64(state->h4, state->s2);
- t[2] = mul32x32_64(state->h0, state->r2) +
- mul32x32_64(state->h1, state->r1) +
- mul32x32_64(state->h2, state->r0) +
- mul32x32_64(state->h3, state->s4) +
- mul32x32_64(state->h4, state->s3);
- t[3] = mul32x32_64(state->h0, state->r3) +
- mul32x32_64(state->h1, state->r2) +
- mul32x32_64(state->h2, state->r1) +
- mul32x32_64(state->h3, state->r0) +
- mul32x32_64(state->h4, state->s4);
- t[4] = mul32x32_64(state->h0, state->r4) +
- mul32x32_64(state->h1, state->r3) +
- mul32x32_64(state->h2, state->r2) +
- mul32x32_64(state->h3, state->r1) +
- mul32x32_64(state->h4, state->r0);
-
- state->h0 = (uint32_t)t[0] & 0x3ffffff;
- c = (t[0] >> 26);
- t[1] += c;
- state->h1 = (uint32_t)t[1] & 0x3ffffff;
- b = (uint32_t)(t[1] >> 26);
- t[2] += b;
- state->h2 = (uint32_t)t[2] & 0x3ffffff;
- b = (uint32_t)(t[2] >> 26);
- t[3] += b;
- state->h3 = (uint32_t)t[3] & 0x3ffffff;
- b = (uint32_t)(t[3] >> 26);
- t[4] += b;
- state->h4 = (uint32_t)t[4] & 0x3ffffff;
- b = (uint32_t)(t[4] >> 26);
- state->h0 += b * 5;
-
- if (len >= 16)
- goto poly1305_donna_16bytes;
-
-/* final bytes */
-poly1305_donna_atmost15bytes:
- if (!len)
- return;
-
- for (j = 0; j < len; j++)
- mp[j] = in[j];
- mp[j++] = 1;
- for (; j < 16; j++)
- mp[j] = 0;
- len = 0;
-
- t0 = U8TO32_LE(mp + 0);
- t1 = U8TO32_LE(mp + 4);
- t2 = U8TO32_LE(mp + 8);
- t3 = U8TO32_LE(mp + 12);
-
- state->h0 += t0 & 0x3ffffff;
- state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
- state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
- state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
- state->h4 += (t3 >> 8);
-
- goto poly1305_donna_mul;
-}
-
-void
-Poly1305Init(poly1305_state *statep, const unsigned char key[32])
-{
- struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
- uint32_t t0, t1, t2, t3;
-
- t0 = U8TO32_LE(key + 0);
- t1 = U8TO32_LE(key + 4);
- t2 = U8TO32_LE(key + 8);
- t3 = U8TO32_LE(key + 12);
-
- /* precompute multipliers */
- state->r0 = t0 & 0x3ffffff;
- t0 >>= 26;
- t0 |= t1 << 6;
- state->r1 = t0 & 0x3ffff03;
- t1 >>= 20;
- t1 |= t2 << 12;
- state->r2 = t1 & 0x3ffc0ff;
- t2 >>= 14;
- t2 |= t3 << 18;
- state->r3 = t2 & 0x3f03fff;
- t3 >>= 8;
- state->r4 = t3 & 0x00fffff;
-
- state->s1 = state->r1 * 5;
- state->s2 = state->r2 * 5;
- state->s3 = state->r3 * 5;
- state->s4 = state->r4 * 5;
-
- /* init state */
- state->h0 = 0;
- state->h1 = 0;
- state->h2 = 0;
- state->h3 = 0;
- state->h4 = 0;
-
- state->buf_used = 0;
- memcpy(state->key, key + 16, sizeof(state->key));
-}
-
-void
-Poly1305Update(poly1305_state *statep, const unsigned char *in,
- size_t in_len)
-{
- unsigned int i;
- struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
-
- if (state->buf_used) {
- unsigned int todo = 16 - state->buf_used;
- if (todo > in_len)
- todo = in_len;
- for (i = 0; i < todo; i++)
- state->buf[state->buf_used + i] = in[i];
- state->buf_used += todo;
- in_len -= todo;
- in += todo;
-
- if (state->buf_used == 16) {
- update(state, state->buf, 16);
- state->buf_used = 0;
- }
- }
-
- if (in_len >= 16) {
- size_t todo = in_len & ~0xf;
- update(state, in, todo);
- in += todo;
- in_len &= 0xf;
- }
-
- if (in_len) {
- for (i = 0; i < in_len; i++)
- state->buf[i] = in[i];
- state->buf_used = in_len;
- }
-}
-
-void
-Poly1305Finish(poly1305_state *statep, unsigned char mac[16])
-{
- struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
- uint64_t f0, f1, f2, f3;
- uint32_t g0, g1, g2, g3, g4;
- uint32_t b, nb;
-
- if (state->buf_used)
- update(state, state->buf, state->buf_used);
-
- b = state->h0 >> 26;
- state->h0 = state->h0 & 0x3ffffff;
- state->h1 += b;
- b = state->h1 >> 26;
- state->h1 = state->h1 & 0x3ffffff;
- state->h2 += b;
- b = state->h2 >> 26;
- state->h2 = state->h2 & 0x3ffffff;
- state->h3 += b;
- b = state->h3 >> 26;
- state->h3 = state->h3 & 0x3ffffff;
- state->h4 += b;
- b = state->h4 >> 26;
- state->h4 = state->h4 & 0x3ffffff;
- state->h0 += b * 5;
-
- g0 = state->h0 + 5;
- b = g0 >> 26;
- g0 &= 0x3ffffff;
- g1 = state->h1 + b;
- b = g1 >> 26;
- g1 &= 0x3ffffff;
- g2 = state->h2 + b;
- b = g2 >> 26;
- g2 &= 0x3ffffff;
- g3 = state->h3 + b;
- b = g3 >> 26;
- g3 &= 0x3ffffff;
- g4 = state->h4 + b - (1 << 26);
-
- b = (g4 >> 31) - 1;
- nb = ~b;
- state->h0 = (state->h0 & nb) | (g0 & b);
- state->h1 = (state->h1 & nb) | (g1 & b);
- state->h2 = (state->h2 & nb) | (g2 & b);
- state->h3 = (state->h3 & nb) | (g3 & b);
- state->h4 = (state->h4 & nb) | (g4 & b);
-
- f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]);
- f1 = ((state->h1 >> 6) | (state->h2 << 20)) + (uint64_t)U8TO32_LE(&state->key[4]);
- f2 = ((state->h2 >> 12) | (state->h3 << 14)) + (uint64_t)U8TO32_LE(&state->key[8]);
- f3 = ((state->h3 >> 18) | (state->h4 << 8)) + (uint64_t)U8TO32_LE(&state->key[12]);
-
- U32TO8_LE(&mac[0], (uint32_t)f0);
- f1 += (f0 >> 32);
- U32TO8_LE(&mac[4], (uint32_t)f1);
- f2 += (f1 >> 32);
- U32TO8_LE(&mac[8], (uint32_t)f2);
- f3 += (f2 >> 32);
- U32TO8_LE(&mac[12], (uint32_t)f3);
-}
diff --git a/lib/freebl/poly1305.h b/lib/freebl/poly1305.h
deleted file mode 100644
index 125f49b3b..000000000
--- a/lib/freebl/poly1305.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * poly1305.h - header file for Poly1305 implementation.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef FREEBL_POLY1305_H_
-#define FREEBL_POLY1305_H_
-
-#include "stddef.h"
-
-typedef unsigned char poly1305_state[512];
-
-/* Poly1305Init sets up |state| so that it can be used to calculate an
- * authentication tag with the one-time key |key|. Note that |key| is a
- * one-time key and therefore there is no `reset' method because that would
- * enable several messages to be authenticated with the same key. */
-extern void Poly1305Init(poly1305_state* state, const unsigned char key[32]);
-
-/* Poly1305Update processes |in_len| bytes from |in|. It can be called zero or
- * more times after poly1305_init. */
-extern void Poly1305Update(poly1305_state* state, const unsigned char* in,
- size_t inLen);
-
-/* Poly1305Finish completes the poly1305 calculation and writes a 16 byte
- * authentication tag to |mac|. */
-extern void Poly1305Finish(poly1305_state* state, unsigned char mac[16]);
-
-#endif /* FREEBL_POLY1305_H_ */
diff --git a/lib/freebl/verified/Hacl_Poly1305_32.c b/lib/freebl/verified/Hacl_Poly1305_32.c
new file mode 100644
index 000000000..6742327f5
--- /dev/null
+++ b/lib/freebl/verified/Hacl_Poly1305_32.c
@@ -0,0 +1,576 @@
+/* Copyright 2016-2017 INRIA and Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Hacl_Poly1305_32.h"
+
+inline static void
+Hacl_Bignum_Modulo_reduce(uint32_t *b)
+{
+ uint32_t b0 = b[0U];
+ b[0U] = (b0 << (uint32_t)2U) + b0;
+}
+
+inline static void
+Hacl_Bignum_Modulo_carry_top(uint32_t *b)
+{
+ uint32_t b4 = b[4U];
+ uint32_t b0 = b[0U];
+ uint32_t b4_26 = b4 >> (uint32_t)26U;
+ b[4U] = b4 & (uint32_t)0x3ffffffU;
+ b[0U] = (b4_26 << (uint32_t)2U) + b4_26 + b0;
+}
+
+inline static void
+Hacl_Bignum_Modulo_carry_top_wide(uint64_t *b)
+{
+ uint64_t b4 = b[4U];
+ uint64_t b0 = b[0U];
+ uint64_t b4_ = b4 & (uint64_t)(uint32_t)0x3ffffffU;
+ uint32_t b4_26 = (uint32_t)(b4 >> (uint32_t)26U);
+ uint64_t b0_ = b0 + (uint64_t)((b4_26 << (uint32_t)2U) + b4_26);
+ b[4U] = b4_;
+ b[0U] = b0_;
+}
+
+inline static void
+Hacl_Bignum_Fproduct_copy_from_wide_(uint32_t *output, uint64_t *input)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U) {
+ uint64_t xi = input[i];
+ output[i] = (uint32_t)xi;
+ }
+}
+
+inline static void
+Hacl_Bignum_Fproduct_sum_scalar_multiplication_(uint64_t *output, uint32_t *input, uint32_t s)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U) {
+ uint64_t xi = output[i];
+ uint32_t yi = input[i];
+ output[i] = xi + (uint64_t)yi * (uint64_t)s;
+ }
+}
+
+inline static void
+Hacl_Bignum_Fproduct_carry_wide_(uint64_t *tmp)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U) {
+ uint32_t ctr = i;
+ uint64_t tctr = tmp[ctr];
+ uint64_t tctrp1 = tmp[ctr + (uint32_t)1U];
+ uint32_t r0 = (uint32_t)tctr & (uint32_t)0x3ffffffU;
+ uint64_t c = tctr >> (uint32_t)26U;
+ tmp[ctr] = (uint64_t)r0;
+ tmp[ctr + (uint32_t)1U] = tctrp1 + c;
+ }
+}
+
+inline static void
+Hacl_Bignum_Fproduct_carry_limb_(uint32_t *tmp)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U) {
+ uint32_t ctr = i;
+ uint32_t tctr = tmp[ctr];
+ uint32_t tctrp1 = tmp[ctr + (uint32_t)1U];
+ uint32_t r0 = tctr & (uint32_t)0x3ffffffU;
+ uint32_t c = tctr >> (uint32_t)26U;
+ tmp[ctr] = r0;
+ tmp[ctr + (uint32_t)1U] = tctrp1 + c;
+ }
+}
+
+inline static void
+Hacl_Bignum_Fmul_shift_reduce(uint32_t *output)
+{
+ uint32_t tmp = output[4U];
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U) {
+ uint32_t ctr = (uint32_t)5U - i - (uint32_t)1U;
+ uint32_t z = output[ctr - (uint32_t)1U];
+ output[ctr] = z;
+ }
+ output[0U] = tmp;
+ Hacl_Bignum_Modulo_reduce(output);
+}
+
+static void
+Hacl_Bignum_Fmul_mul_shift_reduce_(uint64_t *output, uint32_t *input, uint32_t *input2)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U) {
+ uint32_t input2i = input2[i];
+ Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2i);
+ Hacl_Bignum_Fmul_shift_reduce(input);
+ }
+ uint32_t i = (uint32_t)4U;
+ uint32_t input2i = input2[i];
+ Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2i);
+}
+
+inline static void
+Hacl_Bignum_Fmul_fmul(uint32_t *output, uint32_t *input, uint32_t *input2)
+{
+ uint32_t tmp[5U] = { 0U };
+ memcpy(tmp, input, (uint32_t)5U * sizeof input[0U]);
+ uint64_t t[5U] = { 0U };
+ Hacl_Bignum_Fmul_mul_shift_reduce_(t, tmp, input2);
+ Hacl_Bignum_Fproduct_carry_wide_(t);
+ Hacl_Bignum_Modulo_carry_top_wide(t);
+ Hacl_Bignum_Fproduct_copy_from_wide_(output, t);
+ uint32_t i0 = output[0U];
+ uint32_t i1 = output[1U];
+ uint32_t i0_ = i0 & (uint32_t)0x3ffffffU;
+ uint32_t i1_ = i1 + (i0 >> (uint32_t)26U);
+ output[0U] = i0_;
+ output[1U] = i1_;
+}
+
+inline static void
+Hacl_Bignum_AddAndMultiply_add_and_multiply(uint32_t *acc, uint32_t *block, uint32_t *r)
+{
+ for (uint32_t i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U) {
+ uint32_t xi = acc[i];
+ uint32_t yi = block[i];
+ acc[i] = xi + yi;
+ }
+ Hacl_Bignum_Fmul_fmul(acc, acc, r);
+}
+
+inline static void
+Hacl_Impl_Poly1305_32_poly1305_update(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m)
+{
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut0 = st;
+ uint32_t *h = scrut0.h;
+ uint32_t *acc = h;
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *r = scrut.r;
+ uint32_t *r5 = r;
+ uint32_t tmp[5U] = { 0U };
+ uint8_t *s0 = m;
+ uint8_t *s1 = m + (uint32_t)3U;
+ uint8_t *s2 = m + (uint32_t)6U;
+ uint8_t *s3 = m + (uint32_t)9U;
+ uint8_t *s4 = m + (uint32_t)12U;
+ uint32_t i0 = load32_le(s0);
+ uint32_t i1 = load32_le(s1);
+ uint32_t i2 = load32_le(s2);
+ uint32_t i3 = load32_le(s3);
+ uint32_t i4 = load32_le(s4);
+ uint32_t r0 = i0 & (uint32_t)0x3ffffffU;
+ uint32_t r1 = i1 >> (uint32_t)2U & (uint32_t)0x3ffffffU;
+ uint32_t r2 = i2 >> (uint32_t)4U & (uint32_t)0x3ffffffU;
+ uint32_t r3 = i3 >> (uint32_t)6U & (uint32_t)0x3ffffffU;
+ uint32_t r4 = i4 >> (uint32_t)8U;
+ tmp[0U] = r0;
+ tmp[1U] = r1;
+ tmp[2U] = r2;
+ tmp[3U] = r3;
+ tmp[4U] = r4;
+ uint32_t b4 = tmp[4U];
+ uint32_t b4_ = (uint32_t)0x1000000U | b4;
+ tmp[4U] = b4_;
+ Hacl_Bignum_AddAndMultiply_add_and_multiply(acc, tmp, r5);
+}
+
+inline static void
+Hacl_Impl_Poly1305_32_poly1305_process_last_block_(
+ uint8_t *block,
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint64_t rem_)
+{
+ uint32_t tmp[5U] = { 0U };
+ uint8_t *s0 = block;
+ uint8_t *s1 = block + (uint32_t)3U;
+ uint8_t *s2 = block + (uint32_t)6U;
+ uint8_t *s3 = block + (uint32_t)9U;
+ uint8_t *s4 = block + (uint32_t)12U;
+ uint32_t i0 = load32_le(s0);
+ uint32_t i1 = load32_le(s1);
+ uint32_t i2 = load32_le(s2);
+ uint32_t i3 = load32_le(s3);
+ uint32_t i4 = load32_le(s4);
+ uint32_t r0 = i0 & (uint32_t)0x3ffffffU;
+ uint32_t r1 = i1 >> (uint32_t)2U & (uint32_t)0x3ffffffU;
+ uint32_t r2 = i2 >> (uint32_t)4U & (uint32_t)0x3ffffffU;
+ uint32_t r3 = i3 >> (uint32_t)6U & (uint32_t)0x3ffffffU;
+ uint32_t r4 = i4 >> (uint32_t)8U;
+ tmp[0U] = r0;
+ tmp[1U] = r1;
+ tmp[2U] = r2;
+ tmp[3U] = r3;
+ tmp[4U] = r4;
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut0 = st;
+ uint32_t *h = scrut0.h;
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *r = scrut.r;
+ Hacl_Bignum_AddAndMultiply_add_and_multiply(h, tmp, r);
+}
+
+inline static void
+Hacl_Impl_Poly1305_32_poly1305_process_last_block(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint64_t rem_)
+{
+ uint8_t zero1 = (uint8_t)0U;
+ KRML_CHECK_SIZE(zero1, (uint32_t)16U);
+ uint8_t block[16U];
+ for (uint32_t _i = 0U; _i < (uint32_t)16U; ++_i)
+ block[_i] = zero1;
+ uint32_t i0 = (uint32_t)rem_;
+ uint32_t i = (uint32_t)rem_;
+ memcpy(block, m, i * sizeof m[0U]);
+ block[i0] = (uint8_t)1U;
+ Hacl_Impl_Poly1305_32_poly1305_process_last_block_(block, st, m, rem_);
+}
+
+static void
+Hacl_Impl_Poly1305_32_poly1305_last_pass(uint32_t *acc)
+{
+ Hacl_Bignum_Fproduct_carry_limb_(acc);
+ Hacl_Bignum_Modulo_carry_top(acc);
+ uint32_t t0 = acc[0U];
+ uint32_t t10 = acc[1U];
+ uint32_t t20 = acc[2U];
+ uint32_t t30 = acc[3U];
+ uint32_t t40 = acc[4U];
+ uint32_t t1_ = t10 + (t0 >> (uint32_t)26U);
+ uint32_t mask_261 = (uint32_t)0x3ffffffU;
+ uint32_t t0_ = t0 & mask_261;
+ uint32_t t2_ = t20 + (t1_ >> (uint32_t)26U);
+ uint32_t t1__ = t1_ & mask_261;
+ uint32_t t3_ = t30 + (t2_ >> (uint32_t)26U);
+ uint32_t t2__ = t2_ & mask_261;
+ uint32_t t4_ = t40 + (t3_ >> (uint32_t)26U);
+ uint32_t t3__ = t3_ & mask_261;
+ acc[0U] = t0_;
+ acc[1U] = t1__;
+ acc[2U] = t2__;
+ acc[3U] = t3__;
+ acc[4U] = t4_;
+ Hacl_Bignum_Modulo_carry_top(acc);
+ uint32_t t00 = acc[0U];
+ uint32_t t1 = acc[1U];
+ uint32_t t2 = acc[2U];
+ uint32_t t3 = acc[3U];
+ uint32_t t4 = acc[4U];
+ uint32_t t1_0 = t1 + (t00 >> (uint32_t)26U);
+ uint32_t t0_0 = t00 & (uint32_t)0x3ffffffU;
+ uint32_t t2_0 = t2 + (t1_0 >> (uint32_t)26U);
+ uint32_t t1__0 = t1_0 & (uint32_t)0x3ffffffU;
+ uint32_t t3_0 = t3 + (t2_0 >> (uint32_t)26U);
+ uint32_t t2__0 = t2_0 & (uint32_t)0x3ffffffU;
+ uint32_t t4_0 = t4 + (t3_0 >> (uint32_t)26U);
+ uint32_t t3__0 = t3_0 & (uint32_t)0x3ffffffU;
+ acc[0U] = t0_0;
+ acc[1U] = t1__0;
+ acc[2U] = t2__0;
+ acc[3U] = t3__0;
+ acc[4U] = t4_0;
+ Hacl_Bignum_Modulo_carry_top(acc);
+ uint32_t i0 = acc[0U];
+ uint32_t i1 = acc[1U];
+ uint32_t i0_ = i0 & (uint32_t)0x3ffffffU;
+ uint32_t i1_ = i1 + (i0 >> (uint32_t)26U);
+ acc[0U] = i0_;
+ acc[1U] = i1_;
+ uint32_t a0 = acc[0U];
+ uint32_t a1 = acc[1U];
+ uint32_t a2 = acc[2U];
+ uint32_t a3 = acc[3U];
+ uint32_t a4 = acc[4U];
+ uint32_t mask0 = FStar_UInt32_gte_mask(a0, (uint32_t)0x3fffffbU);
+ uint32_t mask1 = FStar_UInt32_eq_mask(a1, (uint32_t)0x3ffffffU);
+ uint32_t mask2 = FStar_UInt32_eq_mask(a2, (uint32_t)0x3ffffffU);
+ uint32_t mask3 = FStar_UInt32_eq_mask(a3, (uint32_t)0x3ffffffU);
+ uint32_t mask4 = FStar_UInt32_eq_mask(a4, (uint32_t)0x3ffffffU);
+ uint32_t mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
+ uint32_t a0_ = a0 - ((uint32_t)0x3fffffbU & mask);
+ uint32_t a1_ = a1 - ((uint32_t)0x3ffffffU & mask);
+ uint32_t a2_ = a2 - ((uint32_t)0x3ffffffU & mask);
+ uint32_t a3_ = a3 - ((uint32_t)0x3ffffffU & mask);
+ uint32_t a4_ = a4 - ((uint32_t)0x3ffffffU & mask);
+ acc[0U] = a0_;
+ acc[1U] = a1_;
+ acc[2U] = a2_;
+ acc[3U] = a3_;
+ acc[4U] = a4_;
+}
+
+static Hacl_Impl_Poly1305_32_State_poly1305_state
+Hacl_Impl_Poly1305_32_mk_state(uint32_t *r, uint32_t *h)
+{
+ return ((Hacl_Impl_Poly1305_32_State_poly1305_state){.r = r, .h = h });
+}
+
+static void
+Hacl_Standalone_Poly1305_32_poly1305_blocks(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint64_t len1)
+{
+ if (!(len1 == (uint64_t)0U)) {
+ uint8_t *block = m;
+ uint8_t *tail1 = m + (uint32_t)16U;
+ Hacl_Impl_Poly1305_32_poly1305_update(st, block);
+ uint64_t len2 = len1 - (uint64_t)1U;
+ Hacl_Standalone_Poly1305_32_poly1305_blocks(st, tail1, len2);
+ }
+}
+
+static void
+Hacl_Standalone_Poly1305_32_poly1305_partial(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *input,
+ uint64_t len1,
+ uint8_t *kr)
+{
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *r = scrut.r;
+ uint32_t *x0 = r;
+ FStar_UInt128_t k1 = load128_le(kr);
+ FStar_UInt128_t
+ k_clamped =
+ FStar_UInt128_logand(k1,
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)0x0ffffffc0ffffffcU),
+ (uint32_t)64U),
+ FStar_UInt128_uint64_to_uint128((uint64_t)0x0ffffffc0fffffffU)));
+ uint32_t r0 = (uint32_t)FStar_UInt128_uint128_to_uint64(k_clamped) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r1 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)26U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r2 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)52U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r3 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)78U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r4 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)104U)) & (uint32_t)0x3ffffffU;
+ x0[0U] = r0;
+ x0[1U] = r1;
+ x0[2U] = r2;
+ x0[3U] = r3;
+ x0[4U] = r4;
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut0 = st;
+ uint32_t *h = scrut0.h;
+ uint32_t *x00 = h;
+ x00[0U] = (uint32_t)0U;
+ x00[1U] = (uint32_t)0U;
+ x00[2U] = (uint32_t)0U;
+ x00[3U] = (uint32_t)0U;
+ x00[4U] = (uint32_t)0U;
+ Hacl_Standalone_Poly1305_32_poly1305_blocks(st, input, len1);
+}
+
+static void
+Hacl_Standalone_Poly1305_32_poly1305_complete(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint64_t len1,
+ uint8_t *k1)
+{
+ uint8_t *kr = k1;
+ uint64_t len16 = len1 >> (uint32_t)4U;
+ uint64_t rem16 = len1 & (uint64_t)0xfU;
+ uint8_t *part_input = m;
+ uint8_t *last_block = m + (uint32_t)((uint64_t)16U * len16);
+ Hacl_Standalone_Poly1305_32_poly1305_partial(st, part_input, len16, kr);
+ if (!(rem16 == (uint64_t)0U))
+ Hacl_Impl_Poly1305_32_poly1305_process_last_block(st, last_block, rem16);
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *h = scrut.h;
+ uint32_t *acc = h;
+ Hacl_Impl_Poly1305_32_poly1305_last_pass(acc);
+}
+
+static void
+Hacl_Standalone_Poly1305_32_crypto_onetimeauth_(
+ uint8_t *output,
+ uint8_t *input,
+ uint64_t len1,
+ uint8_t *k1)
+{
+ uint32_t buf[10U] = { 0U };
+ uint32_t *r = buf;
+ uint32_t *h = buf + (uint32_t)5U;
+ Hacl_Impl_Poly1305_32_State_poly1305_state st = Hacl_Impl_Poly1305_32_mk_state(r, h);
+ uint8_t *key_s = k1 + (uint32_t)16U;
+ Hacl_Standalone_Poly1305_32_poly1305_complete(st, input, len1, k1);
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *h5 = scrut.h;
+ uint32_t *acc = h5;
+ FStar_UInt128_t k_ = load128_le(key_s);
+ uint32_t h0 = acc[0U];
+ uint32_t h1 = acc[1U];
+ uint32_t h2 = acc[2U];
+ uint32_t h3 = acc[3U];
+ uint32_t h4 = acc[4U];
+ FStar_UInt128_t
+ acc_ =
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h4),
+ (uint32_t)104U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h3),
+ (uint32_t)78U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h2),
+ (uint32_t)52U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h1),
+ (uint32_t)26U),
+ FStar_UInt128_uint64_to_uint128((uint64_t)h0)))));
+ FStar_UInt128_t mac_ = FStar_UInt128_add_mod(acc_, k_);
+ store128_le(output, mac_);
+}
+
+static void
+Hacl_Standalone_Poly1305_32_crypto_onetimeauth(
+ uint8_t *output,
+ uint8_t *input,
+ uint64_t len1,
+ uint8_t *k1)
+{
+ Hacl_Standalone_Poly1305_32_crypto_onetimeauth_(output, input, len1, k1);
+}
+
+void *
+Hacl_Poly1305_32_op_String_Access(FStar_Monotonic_HyperStack_mem h, uint8_t *b)
+{
+ return (void *)(uint8_t)0U;
+}
+
+Hacl_Impl_Poly1305_32_State_poly1305_state
+Hacl_Poly1305_32_mk_state(uint32_t *r, uint32_t *acc)
+{
+ return Hacl_Impl_Poly1305_32_mk_state(r, acc);
+}
+
+void
+Hacl_Poly1305_32_init(Hacl_Impl_Poly1305_32_State_poly1305_state st, uint8_t *k1)
+{
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *r = scrut.r;
+ uint32_t *x0 = r;
+ FStar_UInt128_t k10 = load128_le(k1);
+ FStar_UInt128_t
+ k_clamped =
+ FStar_UInt128_logand(k10,
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)0x0ffffffc0ffffffcU),
+ (uint32_t)64U),
+ FStar_UInt128_uint64_to_uint128((uint64_t)0x0ffffffc0fffffffU)));
+ uint32_t r0 = (uint32_t)FStar_UInt128_uint128_to_uint64(k_clamped) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r1 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)26U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r2 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)52U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r3 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)78U)) & (uint32_t)0x3ffffffU;
+ uint32_t
+ r4 =
+ (uint32_t)FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(k_clamped, (uint32_t)104U)) & (uint32_t)0x3ffffffU;
+ x0[0U] = r0;
+ x0[1U] = r1;
+ x0[2U] = r2;
+ x0[3U] = r3;
+ x0[4U] = r4;
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut0 = st;
+ uint32_t *h = scrut0.h;
+ uint32_t *x00 = h;
+ x00[0U] = (uint32_t)0U;
+ x00[1U] = (uint32_t)0U;
+ x00[2U] = (uint32_t)0U;
+ x00[3U] = (uint32_t)0U;
+ x00[4U] = (uint32_t)0U;
+}
+
+void *Hacl_Poly1305_32_empty_log = (void *)(uint8_t)0U;
+
+void
+Hacl_Poly1305_32_update_block(Hacl_Impl_Poly1305_32_State_poly1305_state st, uint8_t *m)
+{
+ Hacl_Impl_Poly1305_32_poly1305_update(st, m);
+}
+
+void
+Hacl_Poly1305_32_update(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint32_t len1)
+{
+ if (!(len1 == (uint32_t)0U)) {
+ uint8_t *block = m;
+ uint8_t *m_ = m + (uint32_t)16U;
+ uint32_t len2 = len1 - (uint32_t)1U;
+ Hacl_Poly1305_32_update_block(st, block);
+ Hacl_Poly1305_32_update(st, m_, len2);
+ }
+}
+
+void
+Hacl_Poly1305_32_update_last(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint32_t len1)
+{
+ if (!((uint64_t)len1 == (uint64_t)0U))
+ Hacl_Impl_Poly1305_32_poly1305_process_last_block(st, m, (uint64_t)len1);
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *h = scrut.h;
+ uint32_t *acc = h;
+ Hacl_Impl_Poly1305_32_poly1305_last_pass(acc);
+}
+
+void
+Hacl_Poly1305_32_finish(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *mac,
+ uint8_t *k1)
+{
+ Hacl_Impl_Poly1305_32_State_poly1305_state scrut = st;
+ uint32_t *h = scrut.h;
+ uint32_t *acc = h;
+ FStar_UInt128_t k_ = load128_le(k1);
+ uint32_t h0 = acc[0U];
+ uint32_t h1 = acc[1U];
+ uint32_t h2 = acc[2U];
+ uint32_t h3 = acc[3U];
+ uint32_t h4 = acc[4U];
+ FStar_UInt128_t
+ acc_ =
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h4),
+ (uint32_t)104U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h3),
+ (uint32_t)78U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h2),
+ (uint32_t)52U),
+ FStar_UInt128_logor(FStar_UInt128_shift_left(FStar_UInt128_uint64_to_uint128((uint64_t)h1),
+ (uint32_t)26U),
+ FStar_UInt128_uint64_to_uint128((uint64_t)h0)))));
+ FStar_UInt128_t mac_ = FStar_UInt128_add_mod(acc_, k_);
+ store128_le(mac, mac_);
+}
+
+void
+Hacl_Poly1305_32_crypto_onetimeauth(
+ uint8_t *output,
+ uint8_t *input,
+ uint64_t len1,
+ uint8_t *k1)
+{
+ Hacl_Standalone_Poly1305_32_crypto_onetimeauth(output, input, len1, k1);
+}
diff --git a/lib/freebl/verified/Hacl_Poly1305_32.h b/lib/freebl/verified/Hacl_Poly1305_32.h
new file mode 100644
index 000000000..4dd070026
--- /dev/null
+++ b/lib/freebl/verified/Hacl_Poly1305_32.h
@@ -0,0 +1,103 @@
+/* Copyright 2016-2017 INRIA and Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kremlib.h"
+#ifndef __Hacl_Poly1305_32_H
+#define __Hacl_Poly1305_32_H
+
+typedef uint32_t Hacl_Bignum_Constants_limb;
+
+typedef uint64_t Hacl_Bignum_Constants_wide;
+
+typedef uint64_t Hacl_Bignum_Wide_t;
+
+typedef uint32_t Hacl_Bignum_Limb_t;
+
+typedef void *Hacl_Impl_Poly1305_32_State_log_t;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_State_uint8_p;
+
+typedef uint32_t *Hacl_Impl_Poly1305_32_State_bigint;
+
+typedef void *Hacl_Impl_Poly1305_32_State_seqelem;
+
+typedef uint32_t *Hacl_Impl_Poly1305_32_State_elemB;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_State_wordB;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_State_wordB_16;
+
+typedef struct
+{
+ uint32_t *r;
+ uint32_t *h;
+} Hacl_Impl_Poly1305_32_State_poly1305_state;
+
+typedef void *Hacl_Impl_Poly1305_32_log_t;
+
+typedef uint32_t *Hacl_Impl_Poly1305_32_bigint;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_uint8_p;
+
+typedef uint32_t *Hacl_Impl_Poly1305_32_elemB;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_wordB;
+
+typedef uint8_t *Hacl_Impl_Poly1305_32_wordB_16;
+
+typedef uint8_t *Hacl_Poly1305_32_uint8_p;
+
+typedef uint64_t Hacl_Poly1305_32_uint64_t;
+
+void *Hacl_Poly1305_32_op_String_Access(FStar_Monotonic_HyperStack_mem h, uint8_t *b);
+
+typedef uint8_t *Hacl_Poly1305_32_key;
+
+typedef Hacl_Impl_Poly1305_32_State_poly1305_state Hacl_Poly1305_32_state;
+
+Hacl_Impl_Poly1305_32_State_poly1305_state
+Hacl_Poly1305_32_mk_state(uint32_t *r, uint32_t *acc);
+
+void Hacl_Poly1305_32_init(Hacl_Impl_Poly1305_32_State_poly1305_state st, uint8_t *k1);
+
+extern void *Hacl_Poly1305_32_empty_log;
+
+void Hacl_Poly1305_32_update_block(Hacl_Impl_Poly1305_32_State_poly1305_state st, uint8_t *m);
+
+void
+Hacl_Poly1305_32_update(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint32_t len1);
+
+void
+Hacl_Poly1305_32_update_last(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *m,
+ uint32_t len1);
+
+void
+Hacl_Poly1305_32_finish(
+ Hacl_Impl_Poly1305_32_State_poly1305_state st,
+ uint8_t *mac,
+ uint8_t *k1);
+
+void
+Hacl_Poly1305_32_crypto_onetimeauth(
+ uint8_t *output,
+ uint8_t *input,
+ uint64_t len1,
+ uint8_t *k1);
+#endif