summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaiki Ueno <dueno@redhat.com>2017-09-22 11:27:34 +0200
committerDaiki Ueno <dueno@redhat.com>2017-09-22 11:27:34 +0200
commit267435d56c5d1508c9cde641dc1db13ad7122f10 (patch)
tree8a00ec09274c20a28e1a92772fd5e16f8c9f10f9
parent0a198a7b2534bd5482dbd5327ce2c313a634954f (diff)
downloadnss-hg-267435d56c5d1508c9cde641dc1db13ad7122f10.tar.gz
Bug 1400603 - freebl: Reorganize AES-GCM source code based on hw/sw implementation, r=franziskus
Reviewers: franziskus Reviewed By: franziskus Bug #: 1400603 Differential Revision: https://phabricator.services.mozilla.com/D65
-rw-r--r--lib/freebl/Makefile4
-rw-r--r--lib/freebl/aes-x86.c157
-rw-r--r--lib/freebl/freebl.gyp65
-rw-r--r--lib/freebl/gcm-x86.c127
-rw-r--r--lib/freebl/gcm.c162
-rw-r--r--lib/freebl/gcm.h14
-rw-r--r--lib/freebl/rijndael.c188
-rw-r--r--lib/freebl/rijndael.h18
8 files changed, 436 insertions, 299 deletions
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index d50e18696..bc1ea86a5 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -110,7 +110,9 @@ endif
# NSS_X86_OR_X64 means the target is either x86 or x64
ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH)))
DEFINES += -DNSS_X86_OR_X64
- CFLAGS += -mpclmul -maes
+ EXTRA_SRCS += gcm-x86.c aes-x86.c
+$(OBJDIR)/gcm-x86.o: CFLAGS += -mpclmul -maes
+$(OBJDIR)/aes-x86.o: CFLAGS += -mpclmul -maes
ifneq (,$(USE_64)$(USE_X32))
DEFINES += -DNSS_X64
else
diff --git a/lib/freebl/aes-x86.c b/lib/freebl/aes-x86.c
new file mode 100644
index 000000000..830b4782f
--- /dev/null
+++ b/lib/freebl/aes-x86.c
@@ -0,0 +1,157 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+#include "rijndael.h"
+#include "secerr.h"
+
+#include <wmmintrin.h> /* aes-ni */
+
+#define EXPAND_KEY128(k, rcon, res) \
+ tmp_key = _mm_aeskeygenassist_si128(k, rcon); \
+ tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \
+ tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \
+ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+ res = _mm_xor_si128(tmp, tmp_key)
+
+static void
+native_key_expansion128(AESContext *cx, const unsigned char *key)
+{
+ __m128i *keySchedule = cx->keySchedule;
+ pre_align __m128i tmp_key post_align;
+ pre_align __m128i tmp post_align;
+ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+ EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
+ EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
+ EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
+ EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
+ EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
+ EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
+ EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
+ EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
+ EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
+ EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
+}
+
+#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \
+ tmp2 = _mm_slli_si128(k0, 4); \
+ tmp1 = _mm_xor_si128(k0, tmp2); \
+ tmp2 = _mm_slli_si128(tmp2, 4); \
+ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+ tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \
+ res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
+
+#define EXPAND_KEY192_PART2(res, k1, k2) \
+ tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
+ res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
+
+#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \
+ EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \
+ EXPAND_KEY192_PART2(carry, res1, tmp3); \
+ res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \
+ _mm_castsi128_pd(tmp3), 0)); \
+ res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \
+ _mm_castsi128_pd(carry), 1)); \
+ EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
+
+static void
+native_key_expansion192(AESContext *cx, const unsigned char *key)
+{
+ __m128i *keySchedule = cx->keySchedule;
+ pre_align __m128i tmp1 post_align;
+ pre_align __m128i tmp2 post_align;
+ pre_align __m128i tmp3 post_align;
+ pre_align __m128i carry post_align;
+ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+ EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
+ keySchedule[3], carry, 0x1, 0x2);
+ EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
+ EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
+ keySchedule[6], carry, 0x4, 0x8);
+ EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
+ EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
+ keySchedule[9], carry, 0x10, 0x20);
+ EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
+ EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
+ keySchedule[12], carry, 0x40, 0x80);
+}
+
+#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \
+ tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \
+ tmp2 = _mm_slli_si128(k1x, 4); \
+ tmp1 = _mm_xor_si128(k1x, tmp2); \
+ tmp2 = _mm_slli_si128(tmp2, 4); \
+ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+ res = _mm_xor_si128(tmp1, tmp_key);
+
+#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \
+ EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
+ EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
+
+static void
+native_key_expansion256(AESContext *cx, const unsigned char *key)
+{
+ __m128i *keySchedule = cx->keySchedule;
+ pre_align __m128i tmp_key post_align;
+ pre_align __m128i tmp1 post_align;
+ pre_align __m128i tmp2 post_align;
+ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+ EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
+ keySchedule[1], 0x01);
+ EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
+ keySchedule[3], 0x02);
+ EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
+ keySchedule[5], 0x04);
+ EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
+ keySchedule[7], 0x08);
+ EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
+ keySchedule[9], 0x10);
+ EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
+ keySchedule[11], 0x20);
+ EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
+ keySchedule[13], 0xFF);
+}
+
+/*
+ * AES key expansion using aes-ni instructions.
+ */
+void
+rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+ unsigned int Nk)
+{
+ switch (Nk) {
+ case 4:
+ native_key_expansion128(cx, key);
+ return;
+ case 6:
+ native_key_expansion192(cx, key);
+ return;
+ case 8:
+ native_key_expansion256(cx, key);
+ return;
+ default:
+ /* This shouldn't happen (checked by the caller). */
+ return;
+ }
+}
+
+void
+rijndael_native_encryptBlock(AESContext *cx,
+ unsigned char *output,
+ const unsigned char *input)
+{
+ int i;
+ pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
+ m = _mm_xor_si128(m, cx->keySchedule[0]);
+ for (i = 1; i < cx->Nr; ++i) {
+ m = _mm_aesenc_si128(m, cx->keySchedule[i]);
+ }
+ m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
+ _mm_storeu_si128((__m128i *)output, m);
+}
diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp
index 1e9347500..5f59eef29 100644
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -23,6 +23,37 @@
]
},
{
+ 'target_name': 'gcm-aes-x86_c_lib',
+ 'type': 'static_library',
+ 'sources': [
+ 'gcm-x86.c', 'aes-x86.c'
+ ],
+ 'dependencies': [
+ '<(DEPTH)/exports.gyp:nss_exports'
+ ],
+ # Enable isa option for pclmul and aes-ni; supported since gcc 4.4.
+ # This is only supported by x86/x64. It's not needed for Windows,
+ # unless clang-cl is used.
+ 'cflags_mozilla': [
+ '-mpclmul', '-maes'
+ ],
+ 'conditions': [
+ [ 'OS=="linux" or OS=="android" or OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', {
+ 'cflags': [
+ '-mpclmul', '-maes'
+ ],
+ }],
+ # macOS build doesn't use cflags.
+ [ 'OS=="mac"', {
+ 'xcode_settings': {
+ 'OTHER_CFLAGS': [
+ '-mpclmul', '-maes'
+ ],
+ },
+ }]
+ ]
+ },
+ {
'target_name': 'freebl',
'type': 'static_library',
'sources': [
@@ -45,6 +76,11 @@
'<(DEPTH)/exports.gyp:nss_exports',
],
'conditions': [
+ [ 'target_arch=="ia32" or target_arch=="x64"', {
+ 'dependencies': [
+ 'gcm-aes-x86_c_lib'
+ ],
+ }],
[ 'OS=="linux"', {
'defines!': [
'FREEBL_NO_DEPEND',
@@ -76,6 +112,11 @@
'<(DEPTH)/exports.gyp:nss_exports',
],
'conditions': [
+ [ 'target_arch=="ia32" or target_arch=="x64"', {
+ 'dependencies': [
+ 'gcm-aes-x86_c_lib'
+ ]
+ }],
[ 'OS!="linux" and OS!="android"', {
'conditions': [
[ 'moz_fold_libs==0', {
@@ -154,27 +195,11 @@
'MP_API_COMPATIBLE'
],
'conditions': [
- [ 'target_arch=="ia32" or target_arch=="x64"', {
- 'cflags_mozilla': [
- '-mpclmul',
- '-maes',
- ],
- 'conditions': [
- [ 'OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', {
- 'cflags': [
- '-mpclmul',
- '-maes',
- ],
- }],
- ],
- }],
[ 'OS=="mac"', {
'xcode_settings': {
# I'm not sure since when this is supported.
# But I hope that doesn't matter. We also assume this is x86/x64.
'OTHER_CFLAGS': [
- '-mpclmul',
- '-maes',
'-std=gnu99',
],
},
@@ -268,14 +293,6 @@
'MP_USE_UINT_DIGIT',
],
}],
- [ 'target_arch=="ia32" or target_arch=="x64"', {
- 'cflags': [
- # enable isa option for pclmul am aes-ni; supported since gcc 4.4
- # This is only support by x84/x64. It's not needed for Windows.
- '-mpclmul',
- '-maes',
- ],
- }],
[ 'target_arch=="arm"', {
'defines': [
'MP_ASSEMBLY_MULTIPLY',
diff --git a/lib/freebl/gcm-x86.c b/lib/freebl/gcm-x86.c
new file mode 100644
index 000000000..e34d63394
--- /dev/null
+++ b/lib/freebl/gcm-x86.c
@@ -0,0 +1,127 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+#include "gcm.h"
+#include "secerr.h"
+
+#include <wmmintrin.h> /* clmul */
+
+#define WRITE64(x, bytes) \
+ (bytes)[0] = (x) >> 56; \
+ (bytes)[1] = (x) >> 48; \
+ (bytes)[2] = (x) >> 40; \
+ (bytes)[3] = (x) >> 32; \
+ (bytes)[4] = (x) >> 24; \
+ (bytes)[5] = (x) >> 16; \
+ (bytes)[6] = (x) >> 8; \
+ (bytes)[7] = (x);
+
+SECStatus
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
+{
+ uint64_t tmp_out[2];
+ _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
+ /* maxout must be larger than 16 bytes (checked by the caller). */
+ WRITE64(tmp_out[0], outbuf + 8);
+ WRITE64(tmp_out[1], outbuf);
+ return SECSuccess;
+}
+
+SECStatus
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+ unsigned int count)
+{
+ size_t i;
+ pre_align __m128i z_high post_align;
+ pre_align __m128i z_low post_align;
+ pre_align __m128i C post_align;
+ pre_align __m128i D post_align;
+ pre_align __m128i E post_align;
+ pre_align __m128i F post_align;
+ pre_align __m128i bin post_align;
+ pre_align __m128i Ci post_align;
+ pre_align __m128i tmp post_align;
+
+ for (i = 0; i < count; i++, buf += 16) {
+ bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
+ ((uint16_t)buf[2] << 8) | buf[3],
+ ((uint16_t)buf[4] << 8) | buf[5],
+ ((uint16_t)buf[6] << 8) | buf[7],
+ ((uint16_t)buf[8] << 8) | buf[9],
+ ((uint16_t)buf[10] << 8) | buf[11],
+ ((uint16_t)buf[12] << 8) | buf[13],
+ ((uint16_t)buf[14] << 8) | buf[15]);
+ Ci = _mm_xor_si128(bin, ghash->x);
+
+ /* Do binary mult ghash->X = Ci * ghash->H. */
+ C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
+ D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
+ E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
+ F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
+ tmp = _mm_xor_si128(E, F);
+ z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
+ z_high = _mm_unpackhi_epi64(z_high, D);
+ z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
+ z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
+
+ /* Shift one to the left (multiply by x) to account for GCM's reflected bit ordering. */
+ C = _mm_slli_si128(z_low, 8);
+ E = _mm_srli_epi64(C, 63);
+ D = _mm_slli_si128(z_high, 8);
+ F = _mm_srli_epi64(D, 63);
+ /* Carry over */
+ C = _mm_srli_si128(z_low, 8);
+ D = _mm_srli_epi64(C, 63);
+ z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
+ z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
+
+ /* Reduce */
+ C = _mm_slli_si128(z_low, 8);
+ /* D = z_low << 127 */
+ D = _mm_slli_epi64(C, 63);
+ /* E = z_low << 126 */
+ E = _mm_slli_epi64(C, 62);
+ /* F = z_low << 121 */
+ F = _mm_slli_epi64(C, 57);
+ /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
+ z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
+ C = _mm_srli_si128(z_low, 8);
+ /* D = z_low >> 1 */
+ D = _mm_slli_epi64(C, 63);
+ D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
+ /* E = z_low >> 2 */
+ E = _mm_slli_epi64(C, 62);
+ E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
+ /* F = z_low >> 7 */
+ F = _mm_slli_epi64(C, 57);
+ F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
+ /* ghash->x = z_high ^ z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
+ ghash->x = _mm_xor_si128(_mm_xor_si128(
+ _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
+ F);
+ }
+ return SECSuccess;
+}
+
+SECStatus
+gcm_HashInit_hw(gcmHashContext *ghash)
+{
+ ghash->ghash_mul = gcm_HashMult_hw;
+ ghash->x = _mm_setzero_si128();
+ /* MSVC requires __m64 to load epi64. */
+ ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
+ ghash->h_low >> 32, (uint32_t)ghash->h_low);
+ ghash->hw = PR_TRUE;
+ return SECSuccess;
+}
+
+SECStatus
+gcm_HashZeroX_hw(gcmHashContext *ghash)
+{
+ ghash->x = _mm_setzero_si128();
+ return SECSuccess;
+}
diff --git a/lib/freebl/gcm.c b/lib/freebl/gcm.c
index 780b7a632..f1e16da78 100644
--- a/lib/freebl/gcm.c
+++ b/lib/freebl/gcm.c
@@ -17,18 +17,50 @@
#include <limits.h>
-#ifdef NSS_X86_OR_X64
-#include <wmmintrin.h> /* clmul */
-#endif
-
/* Forward declarations */
+SECStatus gcm_HashInit_hw(gcmHashContext *ghash);
+SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf);
SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
unsigned int count);
+SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash);
SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf,
unsigned int count);
SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
unsigned int count);
+/* Stub definitions for the above *_hw functions, which shouldn't be
+ * used unless NSS_X86_OR_X64 is defined */
+#ifndef NSS_X86_OR_X64
+SECStatus
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ return SECFailure;
+}
+
+SECStatus
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+ unsigned int count)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ return SECFailure;
+}
+
+SECStatus
+gcm_HashInit_hw(gcmHashContext *ghash)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ return SECFailure;
+}
+
+SECStatus
+gcm_HashZeroX_hw(gcmHashContext *ghash)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ return SECFailure;
+}
+#endif /* NSS_X86_OR_X64 */
+
uint64_t
get64(const unsigned char *bytes)
{
@@ -46,6 +78,8 @@ get64(const unsigned char *bytes)
SECStatus
gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
{
+ SECStatus rv = SECSuccess;
+
ghash->cLen = 0;
ghash->bufLen = 0;
PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf));
@@ -53,17 +87,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
ghash->h_low = get64(H + 8);
ghash->h_high = get64(H);
if (clmul_support() && !sw) {
-#ifdef NSS_X86_OR_X64
- ghash->ghash_mul = gcm_HashMult_hw;
- ghash->x = _mm_setzero_si128();
- /* MSVC requires __m64 to load epi64. */
- ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
- ghash->h_low >> 32, (uint32_t)ghash->h_low);
- ghash->hw = PR_TRUE;
-#else
- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
- return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+ rv = gcm_HashInit_hw(ghash);
} else {
/* We fall back to the software implementation if we can't use / don't
* want to use pclmul. */
@@ -75,7 +99,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
ghash->x_high = ghash->x_low = 0;
ghash->hw = PR_FALSE;
}
- return SECSuccess;
+ return rv;
}
#ifdef HAVE_INT128_SUPPORT
@@ -283,102 +307,17 @@ gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
}
#endif /* HAVE_INT128_SUPPORT */
-SECStatus
-gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
- unsigned int count)
-{
-#ifdef NSS_X86_OR_X64
- size_t i;
- pre_align __m128i z_high post_align;
- pre_align __m128i z_low post_align;
- pre_align __m128i C post_align;
- pre_align __m128i D post_align;
- pre_align __m128i E post_align;
- pre_align __m128i F post_align;
- pre_align __m128i bin post_align;
- pre_align __m128i Ci post_align;
- pre_align __m128i tmp post_align;
-
- for (i = 0; i < count; i++, buf += 16) {
- bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
- ((uint16_t)buf[2] << 8) | buf[3],
- ((uint16_t)buf[4] << 8) | buf[5],
- ((uint16_t)buf[6] << 8) | buf[7],
- ((uint16_t)buf[8] << 8) | buf[9],
- ((uint16_t)buf[10] << 8) | buf[11],
- ((uint16_t)buf[12] << 8) | buf[13],
- ((uint16_t)buf[14] << 8) | buf[15]);
- Ci = _mm_xor_si128(bin, ghash->x);
-
- /* Do binary mult ghash->X = Ci * ghash->H. */
- C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
- D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
- E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
- F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
- tmp = _mm_xor_si128(E, F);
- z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
- z_high = _mm_unpackhi_epi64(z_high, D);
- z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
- z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
-
- /* Shift one to the left (multiply by x) as gcm spec is stupid. */
- C = _mm_slli_si128(z_low, 8);
- E = _mm_srli_epi64(C, 63);
- D = _mm_slli_si128(z_high, 8);
- F = _mm_srli_epi64(D, 63);
- /* Carry over */
- C = _mm_srli_si128(z_low, 8);
- D = _mm_srli_epi64(C, 63);
- z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
- z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
-
- /* Reduce */
- C = _mm_slli_si128(z_low, 8);
- /* D = z_low << 127 */
- D = _mm_slli_epi64(C, 63);
- /* E = z_low << 126 */
- E = _mm_slli_epi64(C, 62);
- /* F = z_low << 121 */
- F = _mm_slli_epi64(C, 57);
- /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
- z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
- C = _mm_srli_si128(z_low, 8);
- /* D = z_low >> 1 */
- D = _mm_slli_epi64(C, 63);
- D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
- /* E = z_low >> 2 */
- E = _mm_slli_epi64(C, 62);
- E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
- /* F = z_low >> 7 */
- F = _mm_slli_epi64(C, 57);
- F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
- /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
- ghash->x = _mm_xor_si128(_mm_xor_si128(
- _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
- F);
- }
- return SECSuccess;
-#else
- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
- return SECFailure;
-#endif /* NSS_X86_OR_X64 */
-}
-
static SECStatus
gcm_zeroX(gcmHashContext *ghash)
{
+ SECStatus rv = SECSuccess;
+
if (ghash->hw) {
-#ifdef NSS_X86_OR_X64
- ghash->x = _mm_setzero_si128();
- return SECSuccess;
-#else
- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
- return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+ rv = gcm_HashZeroX_hw(ghash);
}
ghash->x_high = ghash->x_low = 0;
- return SECSuccess;
+ return rv;
}
/*
@@ -503,15 +442,10 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf,
}
if (ghash->hw) {
-#ifdef NSS_X86_OR_X64
- uint64_t tmp_out[2];
- _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
- WRITE64(tmp_out[0], T + 8);
- WRITE64(tmp_out[1], T);
-#else
- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
- return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+ rv = gcm_HashWrite_hw(ghash, T);
+ if (rv != SECSuccess) {
+ goto cleanup;
+ }
} else {
WRITE64(ghash->x_low, T + 8);
WRITE64(ghash->x_high, T);
diff --git a/lib/freebl/gcm.h b/lib/freebl/gcm.h
index 0c707a081..42ef0f717 100644
--- a/lib/freebl/gcm.h
+++ b/lib/freebl/gcm.h
@@ -9,7 +9,21 @@
#include <stdint.h>
#ifdef NSS_X86_OR_X64
+/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+ (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#undef NSS_DISABLE_SSE2
+#define NSS_DISABLE_SSE2 1
+#endif /* GCC <= 4.8 */
+
#include <emmintrin.h> /* __m128i */
+
+#ifdef NSS_DISABLE_SSE2
+#undef NSS_DISABLE_SSE2
+#pragma GCC pop_options
+#endif /* NSS_DISABLE_SSE2 */
#endif
SEC_BEGIN_PROTOS
diff --git a/lib/freebl/rijndael.c b/lib/freebl/rijndael.c
index a09f13098..5de27de9c 100644
--- a/lib/freebl/rijndael.c
+++ b/lib/freebl/rijndael.c
@@ -27,6 +27,34 @@
#include "intel-gcm.h"
#endif /* INTEL_GCM */
+/* Forward declarations */
+void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+ unsigned int Nk);
+void rijndael_native_encryptBlock(AESContext *cx,
+ unsigned char *output,
+ const unsigned char *input);
+
+/* Stub definitions for the above rijndael_native_* functions, which
+ * shouldn't be used unless NSS_X86_OR_X64 is defined */
+#ifndef NSS_X86_OR_X64
+void
+rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+ unsigned int Nk)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ PORT_Assert(0);
+}
+
+void
+rijndael_native_encryptBlock(AESContext *cx,
+ unsigned char *output,
+ const unsigned char *input)
+{
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ PORT_Assert(0);
+}
+#endif /* NSS_X86_OR_X64 */
+
/*
* There are currently three ways to build this code, varying in performance
* and code size.
@@ -309,162 +337,6 @@ rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int N
}
}
-#if defined(NSS_X86_OR_X64)
-#define EXPAND_KEY128(k, rcon, res) \
- tmp_key = _mm_aeskeygenassist_si128(k, rcon); \
- tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \
- tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \
- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
- res = _mm_xor_si128(tmp, tmp_key)
-
-static void
-native_key_expansion128(AESContext *cx, const unsigned char *key)
-{
- __m128i *keySchedule = cx->keySchedule;
- pre_align __m128i tmp_key post_align;
- pre_align __m128i tmp post_align;
- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
- EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
- EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
- EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
- EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
- EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
- EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
- EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
- EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
- EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
- EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
-}
-
-#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \
- tmp2 = _mm_slli_si128(k0, 4); \
- tmp1 = _mm_xor_si128(k0, tmp2); \
- tmp2 = _mm_slli_si128(tmp2, 4); \
- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
- tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \
- res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
-
-#define EXPAND_KEY192_PART2(res, k1, k2) \
- tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
- res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
-
-#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \
- EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \
- EXPAND_KEY192_PART2(carry, res1, tmp3); \
- res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \
- _mm_castsi128_pd(tmp3), 0)); \
- res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \
- _mm_castsi128_pd(carry), 1)); \
- EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
-
-static void
-native_key_expansion192(AESContext *cx, const unsigned char *key)
-{
- __m128i *keySchedule = cx->keySchedule;
- pre_align __m128i tmp1 post_align;
- pre_align __m128i tmp2 post_align;
- pre_align __m128i tmp3 post_align;
- pre_align __m128i carry post_align;
- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
- EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
- keySchedule[3], carry, 0x1, 0x2);
- EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
- EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
- keySchedule[6], carry, 0x4, 0x8);
- EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
- EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
- keySchedule[9], carry, 0x10, 0x20);
- EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
- EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
- keySchedule[12], carry, 0x40, 0x80);
-}
-
-#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \
- tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \
- tmp2 = _mm_slli_si128(k1x, 4); \
- tmp1 = _mm_xor_si128(k1x, tmp2); \
- tmp2 = _mm_slli_si128(tmp2, 4); \
- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
- res = _mm_xor_si128(tmp1, tmp_key);
-
-#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \
- EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
- EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
-
-static void
-native_key_expansion256(AESContext *cx, const unsigned char *key)
-{
- __m128i *keySchedule = cx->keySchedule;
- pre_align __m128i tmp_key post_align;
- pre_align __m128i tmp1 post_align;
- pre_align __m128i tmp2 post_align;
- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
- EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
- keySchedule[1], 0x01);
- EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
- keySchedule[3], 0x02);
- EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
- keySchedule[5], 0x04);
- EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
- keySchedule[7], 0x08);
- EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
- keySchedule[9], 0x10);
- EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
- keySchedule[11], 0x20);
- EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
- keySchedule[13], 0xFF);
-}
-
-#endif /* NSS_X86_OR_X64 */
-
-/*
- * AES key expansion using aes-ni instructions.
- */
-static void
-native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
-{
-#ifdef NSS_X86_OR_X64
- switch (Nk) {
- case 4:
- native_key_expansion128(cx, key);
- return;
- case 6:
- native_key_expansion192(cx, key);
- return;
- case 8:
- native_key_expansion256(cx, key);
- return;
- default:
- /* This shouldn't happen. */
- PORT_Assert(0);
- }
-#else
- PORT_Assert(0);
-#endif /* NSS_X86_OR_X64 */
-}
-
-static void
-native_encryptBlock(AESContext *cx,
- unsigned char *output,
- const unsigned char *input)
-{
-#ifdef NSS_X86_OR_X64
- int i;
- pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
- m = _mm_xor_si128(m, cx->keySchedule[0]);
- for (i = 1; i < cx->Nr; ++i) {
- m = _mm_aesenc_si128(m, cx->keySchedule[i]);
- }
- m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
- _mm_storeu_si128((__m128i *)output, m);
-#else
- PORT_Assert(0);
-#endif /* NSS_X86_OR_X64 */
-}
-
/* rijndael_key_expansion
*
* Generate the expanded key from the key input by the user.
@@ -830,7 +702,7 @@ rijndael_encryptECB(AESContext *cx, unsigned char *output,
if (aesni_support()) {
/* Use hardware acceleration for normal AES parameters. */
- encryptor = &native_encryptBlock;
+ encryptor = &rijndael_native_encryptBlock;
} else {
encryptor = &rijndael_encryptBlock128;
}
@@ -1026,7 +898,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
cx->mode == NSS_AES_CTR)) {
PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32);
/* Prepare hardware key for normal AES parameters. */
- native_key_expansion(cx, key, Nk);
+ rijndael_native_key_expansion(cx, key, Nk);
} else {
rijndael_key_expansion(cx, key, Nk);
}
diff --git a/lib/freebl/rijndael.h b/lib/freebl/rijndael.h
index 1f4a8a9f7..1b63a323d 100644
--- a/lib/freebl/rijndael.h
+++ b/lib/freebl/rijndael.h
@@ -8,8 +8,22 @@
#include "blapii.h"
#include <stdint.h>
-#ifdef NSS_X86_OR_X64
-#include <wmmintrin.h> /* aes-ni */
+#if defined(NSS_X86_OR_X64)
+/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+ (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#undef NSS_DISABLE_SSE2
+#define NSS_DISABLE_SSE2 1
+#endif /* GCC <= 4.8 */
+
+#include <emmintrin.h> /* __m128i */
+
+#ifdef NSS_DISABLE_SSE2
+#undef NSS_DISABLE_SSE2
+#pragma GCC pop_options
+#endif /* NSS_DISABLE_SSE2 */
#endif
typedef void AESBlockFunc(AESContext *cx,