diff options
author | rrelyea%redhat.com <devnull@localhost> | 2013-01-15 02:36:11 +0000 |
---|---|---|
committer | rrelyea%redhat.com <devnull@localhost> | 2013-01-15 02:36:11 +0000 |
commit | 094bde5d3a5df11e7f03ecc73a492c07d433e8d6 (patch) | |
tree | 87c605d15e71403d3528c58c993866d0ef02f0dc | |
parent | 9f1f716dcf13a4cb0c65e8fe286693e900ada116 (diff) | |
download | nss-hg-094bde5d3a5df11e7f03ecc73a492c07d433e8d6.tar.gz |
Bug 805604 - Efficient AES-GCM implementation that uses Intel's AES and PCLMULQDQ instructions (AES-NI) and the Advanced Vector Extension (AVX) architecture.
patch by Shay Gueron, review by rrelyea.
-rw-r--r-- | security/nss/lib/freebl/Makefile | 23 | ||||
-rw-r--r-- | security/nss/lib/freebl/intel-gcm-wrap.c | 235 | ||||
-rw-r--r-- | security/nss/lib/freebl/intel-gcm.h | 62 | ||||
-rw-r--r-- | security/nss/lib/freebl/intel-gcm.s | 1335 | ||||
-rw-r--r-- | security/nss/lib/freebl/manifest.mn | 1 | ||||
-rw-r--r-- | security/nss/lib/freebl/rijndael.c | 29 |
6 files changed, 1678 insertions, 7 deletions
diff --git a/security/nss/lib/freebl/Makefile b/security/nss/lib/freebl/Makefile index 648adec14..9ad9599da 100644 --- a/security/nss/lib/freebl/Makefile +++ b/security/nss/lib/freebl/Makefile @@ -91,7 +91,7 @@ ifdef FREEBL_PRELINK_COMMAND DEFINES +=-DFREEBL_PRELINK_COMMAND=\"$(FREEBL_PRELINK_COMMAND)\" endif # NSS_X86 means the target is a 32-bits x86 CPU architecture -# NSS_X64 means the target is a 64-bits x64 CPU architecture +# NSS_X64 means the target is a 64-bits x64 CPU architecture # NSS_X86_OR_X64 means the target is either x86 or x64 ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH))) DEFINES += -DNSS_X86_OR_X64 @@ -187,7 +187,9 @@ ifeq ($(CPU_ARCH),x86_64) # DEFINES += -DMPI_AMD64_ADD # comment the next two lines to turn off intel HW accelleration DEFINES += -DUSE_HW_AES - ASFILES += intel-aes.s + ASFILES += intel-aes.s intel-gcm.s + EXTRA_SRCS += intel-gcm-wrap.c + INTEL_GCM=1 MPI_SRCS += mpi_amd64.c mp_comba.c endif ifeq ($(CPU_ARCH),x86) @@ -442,7 +444,9 @@ else DEFINES += -DNSS_USE_COMBA -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN # comment the next two lines to turn off intel HW accelleration DEFINES += -DUSE_HW_AES - ASFILES += intel-aes.s + ASFILES += intel-aes.s intel-gcm.s + EXTRA_SRCS += intel-gcm-wrap.c + INTEL_GCM=1 MPI_SRCS += mpi_amd64.c else # Solaris x86 @@ -643,3 +647,16 @@ else endif endif endif + +ifdef INTEL_GCM +# +# GCM binary needs -msse4 +# +$(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): intel-gcm-wrap.c + @$(MAKE_OBJDIR) +ifdef NEED_ABSOLUTE_PATH + $(CC) -o $@ -c -mssse3 $(CFLAGS) $(call core_abspath,$<) +else + $(CC) -o $@ -c -mssse3 $(CFLAGS) $< +endif +endif diff --git a/security/nss/lib/freebl/intel-gcm-wrap.c b/security/nss/lib/freebl/intel-gcm-wrap.c new file mode 100644 index 000000000..cda2fac58 --- /dev/null +++ b/security/nss/lib/freebl/intel-gcm-wrap.c @@ -0,0 +1,235 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Wrapper functions for Intel optimized implementation of AES-GCM */ + +#ifdef USE_HW_AES + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif + +#include "blapii.h" +#include "blapit.h" +#include "gcm.h" +#include "ctr.h" +#include "secerr.h" +#include "prtypes.h" +#include "pkcs11t.h" + +#include <limits.h> + +#include "intel-gcm.h" +#include "rijndael.h" + +#if defined(__INTEL_COMPILER) +#include <ia32intrin.h> +#elif defined(__GNUC__) +#include <emmintrin.h> +#include <tmmintrin.h> +#endif + + +struct intel_AES_GCMContextStr{ + unsigned char Htbl[16*AES_BLOCK_SIZE]; + unsigned char X0[AES_BLOCK_SIZE]; + unsigned char T[AES_BLOCK_SIZE]; + unsigned char CTR[AES_BLOCK_SIZE]; + AESContext *aes_context; + unsigned long tagBits; + unsigned long Alen; + unsigned long Mlen; +}; + +intel_AES_GCMContext *intel_AES_GCM_CreateContext(void *context, + freeblCipherFunc cipher, + const unsigned char *params, + unsigned int blocksize) +{ + intel_AES_GCMContext *gcm = NULL; + AESContext *aes = (AESContext*)context; + const CK_GCM_PARAMS *gcmParams = (const CK_GCM_PARAMS *)params; + unsigned char buff[AES_BLOCK_SIZE]; /* aux buffer */ + + int IV_whole_len = gcmParams->ulIvLen&(~0xf); + int IV_remainder_len = gcmParams->ulIvLen&0xf; + int AAD_whole_len = gcmParams->ulAADLen&(~0xf); + int AAD_remainder_len = gcmParams->ulAADLen&0xf; + + __m128i BSWAP_MASK = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + __m128i ONE = _mm_set_epi32(0,0,0,1); + unsigned int j; + SECStatus rv; + + if (blocksize != AES_BLOCK_SIZE) { + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return NULL; + } + gcm = PORT_ZNew(intel_AES_GCMContext); + + if (gcm == NULL) { + return NULL; + } + /* initialize context fields */ + gcm->aes_context = aes; + gcm->tagBits = gcmParams->ulTagBits; + gcm->Alen = 0; + gcm->Mlen = 0; + /* first prepare H and its derivatives for ghash */ + 
intel_aes_gcmINIT(gcm->Htbl, (unsigned char*)aes->expandedKey, aes->Nr); + /* Initial TAG value is zero*/ + _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128()); + _mm_storeu_si128((__m128i*)gcm->X0, _mm_setzero_si128()); + /* Init the counter */ + if(gcmParams->ulIvLen == 12) { + _mm_storeu_si128((__m128i*)gcm->CTR, _mm_setr_epi32(((unsigned int*)gcmParams->pIv)[0], ((unsigned int*)gcmParams->pIv)[1], ((unsigned int*)gcmParams->pIv)[2], 0x01000000)); + } else { + /* If IV size is not 96 bits, then the initial counter value is GHASH of the IV */ + intel_aes_gcmAAD(gcm->Htbl, gcmParams->pIv, IV_whole_len, gcm->T); + /* Partial block */ + if(IV_remainder_len) { + PORT_Memset(buff, 0, AES_BLOCK_SIZE); + PORT_Memcpy(buff, gcmParams->pIv + IV_whole_len, IV_remainder_len); + intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T); + } + + intel_aes_gcmTAG + ( + gcm->Htbl, + gcm->T, + gcmParams->ulIvLen, + 0, + gcm->X0, + gcm->CTR + ); + /* TAG should be zero again */ + _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128()); + } + /* Encrypt the initial counter, will be used to encrypt the GHASH value, in the end */ + rv = (*cipher)(context, gcm->X0, &j, AES_BLOCK_SIZE, gcm->CTR, AES_BLOCK_SIZE, AES_BLOCK_SIZE); + if (rv != SECSuccess) { + goto loser; + } + /* Promote the counter by 1 */ + _mm_storeu_si128((__m128i*)gcm->CTR, _mm_shuffle_epi8(_mm_add_epi32(ONE, _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)gcm->CTR), BSWAP_MASK)), BSWAP_MASK)); + +/* Now hash AAD - it would actually make sense to separate the context creation from the AAD, + * because that would allow to reuse the H, which only changes when the AES key changes, + * and not every package, like the IV and AAD */ + intel_aes_gcmAAD(gcm->Htbl, gcmParams->pAAD, AAD_whole_len, gcm->T); + if(AAD_remainder_len) { + PORT_Memset(buff, 0, AES_BLOCK_SIZE); + PORT_Memcpy(buff, gcmParams->pAAD + AAD_whole_len, AAD_remainder_len); + intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T); + } + gcm->Alen += 
gcmParams->ulAADLen; + return gcm; + + loser: + if (gcm) { + PORT_Free(gcm); + } + return NULL; +} + +void intel_AES_GCM_DestroyContext(intel_AES_GCMContext *gcm, PRBool freeit) +{ + if (freeit) { + PORT_Free(gcm); + } +} + +SECStatus intel_AES_GCM_EncryptUpdate(intel_AES_GCMContext *gcm, + unsigned char *outbuf, + unsigned int *outlen, unsigned int maxout, + const unsigned char *inbuf, unsigned int inlen, + unsigned int blocksize) +{ + unsigned int tagBytes; + unsigned char T[AES_BLOCK_SIZE]; + int j; + + tagBytes = (gcm->tagBits + (PR_BITS_PER_BYTE-1)) / PR_BITS_PER_BYTE; + if (UINT_MAX - inlen < tagBytes) { + PORT_SetError(SEC_ERROR_INPUT_LEN); + return SECFailure; + } + if (maxout < inlen + tagBytes) { + *outlen = inlen + tagBytes; + PORT_SetError(SEC_ERROR_OUTPUT_LEN); + return SECFailure; + } + + intel_aes_gcmENC( + inbuf, + outbuf, + gcm, + inlen); + + gcm->Mlen += inlen; + + intel_aes_gcmTAG( + gcm->Htbl, + gcm->T, + gcm->Mlen, + gcm->Alen, + gcm->X0, + T); + + *outlen = inlen + tagBytes; + + for(j=0; j<tagBytes; j++) + { + outbuf[inlen+j] = T[j]; + } + return SECSuccess; +} + +SECStatus intel_AES_GCM_DecryptUpdate(intel_AES_GCMContext *gcm, + unsigned char *outbuf, + unsigned int *outlen, unsigned int maxout, + const unsigned char *inbuf, unsigned int inlen, + unsigned int blocksize) +{ + unsigned int tagBytes; + unsigned char T[AES_BLOCK_SIZE]; + const unsigned char *intag; + + tagBytes = (gcm->tagBits + (PR_BITS_PER_BYTE-1)) / PR_BITS_PER_BYTE; + + /* get the authentication block */ + if (inlen < tagBytes) { + PORT_SetError(SEC_ERROR_INVALID_ARGS); + return SECFailure; + } + + inlen -= tagBytes; + intag = inbuf + inlen; + + intel_aes_gcmDEC( + inbuf, + outbuf, + gcm, + inlen); + + gcm->Mlen += inlen; + intel_aes_gcmTAG( + gcm->Htbl, + gcm->T, + gcm->Mlen, + gcm->Alen, + gcm->X0, + T); + + if (NSS_SecureMemcmp(T, intag, tagBytes) != 0) { + /* force a CKR_ENCRYPTED_DATA_INVALID error in softoken */ + PORT_SetError(SEC_ERROR_BAD_DATA); + return 
SECFailure; + } + *outlen = inlen; + + return SECSuccess; +} + +#endif diff --git a/security/nss/lib/freebl/intel-gcm.h b/security/nss/lib/freebl/intel-gcm.h new file mode 100644 index 000000000..29bfba8d2 --- /dev/null +++ b/security/nss/lib/freebl/intel-gcm.h @@ -0,0 +1,62 @@ +#ifndef INTEL_GCM_H +#define INTEL_GCM_H 1 + +#include "blapii.h" + +typedef struct intel_AES_GCMContextStr intel_AES_GCMContext; + +intel_AES_GCMContext *intel_AES_GCM_CreateContext(void *context, freeblCipherFunc cipher, + const unsigned char *params, unsigned int blocksize); + +void intel_AES_GCM_DestroyContext(intel_AES_GCMContext *gcm, PRBool freeit); + +SECStatus intel_AES_GCM_EncryptUpdate(intel_AES_GCMContext *gcm, unsigned char *outbuf, + unsigned int *outlen, unsigned int maxout, + const unsigned char *inbuf, unsigned int inlen, + unsigned int blocksize); + +SECStatus intel_AES_GCM_DecryptUpdate(intel_AES_GCMContext *gcm, unsigned char *outbuf, + unsigned int *outlen, unsigned int maxout, + const unsigned char *inbuf, unsigned int inlen, + unsigned int blocksize); + +/* Prototypes of functions in the assembler file for fast AES-GCM, using + Intel AES-NI and CLMUL-NI, as described in [1] + [1] Shay Gueron, Michael E. Kounavis: Intel® Carry-Less Multiplication + Instruction and its Usage for Computing the GCM Mode */ + +/* Prepares the constants used in the aggregated reduction method */ +void intel_aes_gcmINIT(unsigned char Htbl[16*16], + unsigned char *KS, + int NR); + +/* Produces the final GHASH value */ +void intel_aes_gcmTAG(unsigned char Htbl[16*16], + unsigned char *Tp, + unsigned long Mlen, + unsigned long Alen, + unsigned char* X0, + unsigned char* TAG); + +/* Hashes the Additional Authenticated Data, should be used before enc/dec. + Operates on whole blocks only. Partial blocks should be padded externally. */ +void intel_aes_gcmAAD(unsigned char Htbl[16*16], + unsigned char *AAD, + unsigned long Alen, + unsigned char *Tp); + +/* Encrypts and hashes the Plaintext. 
+ Operates on any length of data, however partial block should only be encrypted + at the last call, otherwise the result will be incorrect. */ +void intel_aes_gcmENC(const unsigned char* PT, + unsigned char* CT, + void *Gctx, + unsigned long len); + +/* Similar to ENC, but decrypts the Ciphertext. */ +void intel_aes_gcmDEC(const unsigned char* CT, + unsigned char* PT, + void *Gctx, + unsigned long len); + +#endif diff --git a/security/nss/lib/freebl/intel-gcm.s b/security/nss/lib/freebl/intel-gcm.s new file mode 100644 index 000000000..49d8ecd98 --- /dev/null +++ b/security/nss/lib/freebl/intel-gcm.s @@ -0,0 +1,1335 @@ + + +.align 16 +.Lone: +.quad 1,0 +.Ltwo: +.quad 2,0 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lshuff_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.Lpoly: +.quad 0x1, 0xc200000000000000 + + +################################################################################ +# Generates the final GCM tag +# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG); +.type intel_aes_gcmTAG,@function +.globl intel_aes_gcmTAG +.align 16 +intel_aes_gcmTAG: + +.set Htbl, %rdi +.set Tp, %rsi +.set Mlen, %rdx +.set Alen, %rcx +.set X0, %r8 +.set TAG, %r9 + +.set T,%xmm0 +.set TMP0,%xmm1 + + vmovdqu (Tp), T + vpshufb .Lbswap_mask(%rip), T, T + vpxor TMP0, TMP0, TMP0 + shl $3, Mlen + shl $3, Alen + vpinsrq $0, Mlen, TMP0, TMP0 + vpinsrq $1, Alen, TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + vpshufb .Lbswap_mask(%rip), T, T + vpxor (X0), T, T + vmovdqu T, (TAG) + +ret +.size intel_aes_gcmTAG, .-intel_aes_gcmTAG +################################################################################ +# Generates the H table +# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR); +.type intel_aes_gcmINIT,@function +.globl intel_aes_gcmINIT +.align 16 +intel_aes_gcmINIT: + +.set Htbl, %rdi +.set KS, %rsi +.set NR, %edx + +.set T,%xmm0 +.set TMP0,%xmm1 + 
+CALCULATE_POWERS_OF_H: + vmovdqu 16*0(KS), T + vaesenc 16*1(KS), T, T + vaesenc 16*2(KS), T, T + vaesenc 16*3(KS), T, T + vaesenc 16*4(KS), T, T + vaesenc 16*5(KS), T, T + vaesenc 16*6(KS), T, T + vaesenc 16*7(KS), T, T + vaesenc 16*8(KS), T, T + vaesenc 16*9(KS), T, T + vmovdqu 16*10(KS), TMP0 + cmp $10, NR + je .LH0done + vaesenc 16*10(KS), T, T + vaesenc 16*11(KS), T, T + vmovdqu 16*12(KS), TMP0 + cmp $12, NR + je .LH0done + vaesenc 16*12(KS), T, T + vaesenc 16*13(KS), T, T + vmovdqu 16*14(KS), TMP0 + +.LH0done: + vaesenclast TMP0, T, T + + vpshufb .Lbswap_mask(%rip), T, T + + vmovdqu T, TMP0 + # Calculate H` = GFMUL(H, 2) + vpsrld $7 , T , %xmm3 + vmovdqu .Lshuff_mask(%rip), %xmm4 + vpshufb %xmm4, %xmm3 , %xmm3 + movq $0xff00 , %rax + vmovq %rax, %xmm4 + vpshufb %xmm3, %xmm4 , %xmm4 + vmovdqu .Lpoly(%rip), %xmm5 + vpand %xmm4, %xmm5, %xmm5 + vpsrld $31, T, %xmm3 + vpslld $1, T, %xmm4 + vpslldq $4, %xmm3, %xmm3 + vpxor %xmm3, %xmm4, T #xmm1 holds now p(x)<<1 + + #adding p(x)<<1 to xmm5 + vpxor %xmm5, T , T + vmovdqu T, TMP0 + vmovdqu T, (Htbl) # H * 2 + call GFMUL + vmovdqu T, 16(Htbl) # H^2 * 2 + call GFMUL + vmovdqu T, 32(Htbl) # H^3 * 2 + call GFMUL + vmovdqu T, 48(Htbl) # H^4 * 2 + call GFMUL + vmovdqu T, 64(Htbl) # H^5 * 2 + call GFMUL + vmovdqu T, 80(Htbl) # H^6 * 2 + call GFMUL + vmovdqu T, 96(Htbl) # H^7 * 2 + call GFMUL + vmovdqu T, 112(Htbl) # H^8 * 2 + + # Precalculations for the reduce 4 step + vpshufd $78, (Htbl), %xmm8 + vpshufd $78, 16(Htbl), %xmm9 + vpshufd $78, 32(Htbl), %xmm10 + vpshufd $78, 48(Htbl), %xmm11 + vpshufd $78, 64(Htbl), %xmm12 + vpshufd $78, 80(Htbl), %xmm13 + vpshufd $78, 96(Htbl), %xmm14 + vpshufd $78, 112(Htbl), %xmm15 + + vpxor (Htbl), %xmm8, %xmm8 + vpxor 16(Htbl), %xmm9, %xmm9 + vpxor 32(Htbl), %xmm10, %xmm10 + vpxor 48(Htbl), %xmm11, %xmm11 + vpxor 64(Htbl), %xmm12, %xmm12 + vpxor 80(Htbl), %xmm13, %xmm13 + vpxor 96(Htbl), %xmm14, %xmm14 + vpxor 112(Htbl), %xmm15, %xmm15 + + vmovdqu %xmm8, 128(Htbl) + vmovdqu %xmm9, 
144(Htbl) + vmovdqu %xmm10, 160(Htbl) + vmovdqu %xmm11, 176(Htbl) + vmovdqu %xmm12, 192(Htbl) + vmovdqu %xmm13, 208(Htbl) + vmovdqu %xmm14, 224(Htbl) + vmovdqu %xmm15, 240(Htbl) + + ret +.size intel_aes_gcmINIT, .-intel_aes_gcmINIT +################################################################################ +# Authenticate only +# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp); + +.globl intel_aes_gcmAAD +.type intel_aes_gcmAAD,@function +.align 16 +intel_aes_gcmAAD: + +.set DATA, %xmm0 +.set T, %xmm1 +.set BSWAP_MASK, %xmm2 +.set TMP0, %xmm3 +.set TMP1, %xmm4 +.set TMP2, %xmm5 +.set TMP3, %xmm6 +.set TMP4, %xmm7 +.set Xhi, %xmm9 + +.set Htbl, %rdi +.set inp, %rsi +.set len, %rdx +.set Tp, %rcx + +.set hlp0, %r11 + +.macro KARATSUBA_AAD i + vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3 + vpxor TMP3, TMP0, TMP0 + vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3 + vpxor TMP3, TMP1, TMP1 + vpshufd $78, DATA, TMP3 + vpxor DATA, TMP3, TMP3 + vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3 + vpxor TMP3, TMP2, TMP2 +.endm + + test len, len + jnz .LbeginAAD + ret + +.LbeginAAD: + + push hlp0 + vzeroupper + + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + vpxor Xhi, Xhi, Xhi + + vmovdqu (Tp),T + vpshufb BSWAP_MASK,T,T + + # we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first + mov len, hlp0 + and $~-128, hlp0 + + jz .Lmod_loop + + sub hlp0, len + sub $16, hlp0 + + #hash first prefix block + vmovdqu (inp), DATA + vpshufb BSWAP_MASK, DATA, DATA + vpxor T, DATA, DATA + + vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0 + vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1 + vpshufd $78, DATA, TMP2 + vpxor DATA, TMP2, TMP2 + vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2 + + lea 16(inp), inp + test hlp0, hlp0 + jnz .Lpre_loop + jmp .Lred1 + + #hash remaining prefix bocks (up to 7 total prefix blocks) +.align 64 +.Lpre_loop: + + sub $16, hlp0 + + vmovdqu (inp),DATA # next data block + vpshufb 
BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3 + vpxor TMP3, TMP0, TMP0 + vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3 + vpxor TMP3, TMP1, TMP1 + vpshufd $78, DATA, TMP3 + vpxor DATA, TMP3, TMP3 + vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3 + vpxor TMP3, TMP2, TMP2 + + test hlp0, hlp0 + + lea 16(inp), inp + + jnz .Lpre_loop + +.Lred1: + vpxor TMP0, TMP2, TMP2 + vpxor TMP1, TMP2, TMP2 + vpsrldq $8, TMP2, TMP3 + vpslldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, Xhi + vpxor TMP2, TMP0, T + +.align 64 +.Lmod_loop: + sub $0x80, len + jb .Ldone + + vmovdqu 16*7(inp),DATA # Ii + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x00, (Htbl), DATA, TMP0 + vpclmulqdq $0x11, (Htbl), DATA, TMP1 + vpshufd $78, DATA, TMP2 + vpxor DATA, TMP2, TMP2 + vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2 + ######################################################### + vmovdqu 16*6(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + KARATSUBA_AAD 1 + ######################################################### + vmovdqu 16*5(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a + vpalignr $8, T, T, T + + KARATSUBA_AAD 2 + + vpxor TMP4, T, T #reduction stage 1b + ######################################################### + vmovdqu 16*4(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 3 + ######################################################### + vmovdqu 16*3(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a + vpalignr $8, T, T, T + + KARATSUBA_AAD 4 + + vpxor TMP4, T, T #reduction stage 2b + ######################################################### + vmovdqu 16*2(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 5 + + vpxor Xhi, T, T #reduction finalize + ######################################################### + vmovdqu 16*1(inp),DATA + vpshufb BSWAP_MASK,DATA,DATA + + KARATSUBA_AAD 6 + ######################################################### + vmovdqu 16*0(inp),DATA + 
vpshufb BSWAP_MASK,DATA,DATA + vpxor T,DATA,DATA + + KARATSUBA_AAD 7 + ######################################################### + vpxor TMP0, TMP2, TMP2 # karatsuba fixup + vpxor TMP1, TMP2, TMP2 + vpsrldq $8, TMP2, TMP3 + vpslldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, Xhi + vpxor TMP2, TMP0, T + + lea 16*8(inp), inp + jmp .Lmod_loop + ######################################################### + +.Ldone: + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3 + vpalignr $8, T, T, T + vpxor TMP3, T, T + + vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3 + vpalignr $8, T, T, T + vpxor TMP3, T, T + + vpxor Xhi, T, T + +.Lsave: + vpshufb BSWAP_MASK,T, T + vmovdqu T,(Tp) + vzeroupper + + pop hlp0 + ret +.size intel_aes_gcmAAD,.-intel_aes_gcmAAD + +################################################################################ +# Encrypt and Authenticate +# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len); +.type intel_aes_gcmENC,@function +.globl intel_aes_gcmENC +.align 16 +intel_aes_gcmENC: + +.set PT,%rdi +.set CT,%rsi +.set Htbl, %rdx +.set len, %rcx +.set KS,%r9 +.set NR,%r10d + +.set Gctx, %rdx + +.set T,%xmm0 +.set TMP0,%xmm1 +.set TMP1,%xmm2 +.set TMP2,%xmm3 +.set TMP3,%xmm4 +.set TMP4,%xmm5 +.set TMP5,%xmm6 +.set CTR0,%xmm7 +.set CTR1,%xmm8 +.set CTR2,%xmm9 +.set CTR3,%xmm10 +.set CTR4,%xmm11 +.set CTR5,%xmm12 +.set CTR6,%xmm13 +.set CTR7,%xmm14 +.set CTR,%xmm15 + +.macro ROUND i + vmovdqu \i*16(KS), TMP3 + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, CTR7, CTR7 +.endm + +.macro ROUNDMUL i + + vmovdqu \i*16(%rsp), TMP5 + vmovdqu \i*16(KS), TMP3 + + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + + vpshufd $78, TMP5, TMP4 + vpxor TMP5, TMP4, TMP4 + + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, 
CTR7, CTR7 + + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3 + vpxor TMP3, TMP0, TMP0 + vmovdqa \i*16(Htbl), TMP4 + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + +.endm + +.macro KARATSUBA i + vmovdqu \i*16(%rsp), TMP5 + + vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 +.endm + + test len, len + jnz .Lbegin + ret + +.Lbegin: + + vzeroupper + push %rbp + push %rbx + + movq %rsp, %rbp + sub $128, %rsp + andq $-16, %rsp + + vmovdqu 288(Gctx), CTR + vmovdqu 272(Gctx), T + mov 304(Gctx), KS + mov 4(KS), NR + lea 48(KS), KS + + vpshufb .Lbswap_mask(%rip), CTR, CTR + vpshufb .Lbswap_mask(%rip), T, T + + cmp $128, len + jb .LDataSingles + +# Encrypt the first eight blocks + sub $128, len + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vpshufb .Lbswap_mask(%rip), CTR0, CTR0 + vpshufb .Lbswap_mask(%rip), CTR1, CTR1 + vpshufb .Lbswap_mask(%rip), CTR2, CTR2 + vpshufb .Lbswap_mask(%rip), CTR3, CTR3 + vpshufb .Lbswap_mask(%rip), CTR4, CTR4 + vpshufb .Lbswap_mask(%rip), CTR5, CTR5 + vpshufb .Lbswap_mask(%rip), CTR6, CTR6 + vpshufb .Lbswap_mask(%rip), CTR7, CTR7 + + vpxor (KS), CTR0, CTR0 + vpxor (KS), CTR1, CTR1 + vpxor (KS), CTR2, CTR2 + vpxor (KS), CTR3, CTR3 + vpxor (KS), CTR4, CTR4 + vpxor (KS), CTR5, CTR5 + vpxor (KS), CTR6, CTR6 + vpxor (KS), CTR7, CTR7 + + ROUND 1 + ROUND 2 + ROUND 3 + ROUND 4 + ROUND 5 + ROUND 6 + ROUND 7 + ROUND 8 + ROUND 9 + + vmovdqu 160(KS), TMP5 + cmp $12, NR + jb .LLast1 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp 
$14, NR + jb .LLast1 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LLast1: + + vpxor (PT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(PT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(PT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(PT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(PT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(PT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(PT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(PT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu .Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (CT) + vpshufb TMP3, CTR0, CTR0 + vmovdqu CTR1, 16(CT) + vpshufb TMP3, CTR1, CTR1 + vmovdqu CTR2, 32(CT) + vpshufb TMP3, CTR2, CTR2 + vmovdqu CTR3, 48(CT) + vpshufb TMP3, CTR3, CTR3 + vmovdqu CTR4, 64(CT) + vpshufb TMP3, CTR4, CTR4 + vmovdqu CTR5, 80(CT) + vpshufb TMP3, CTR5, CTR5 + vmovdqu CTR6, 96(CT) + vpshufb TMP3, CTR6, CTR6 + vmovdqu CTR7, 112(CT) + vpshufb TMP3, CTR7, CTR7 + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDataOctets + +# Encrypt 8 blocks each time while hashing previous 8 blocks +.align 64 +.LDataOctets: + cmp $128, len + jb .LEndOctets + sub $128, len + + vmovdqa CTR7, TMP5 + vmovdqa CTR6, 1*16(%rsp) + vmovdqa CTR5, 2*16(%rsp) + vmovdqa CTR4, 3*16(%rsp) + vmovdqa CTR3, 4*16(%rsp) + vmovdqa CTR2, 5*16(%rsp) + vmovdqa CTR1, 6*16(%rsp) + vmovdqa CTR0, 7*16(%rsp) + + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vmovdqu (KS), TMP4 + vpshufb TMP3, CTR0, CTR0 + vpxor TMP4, CTR0, CTR0 + vpshufb TMP3, CTR1, CTR1 + vpxor TMP4, CTR1, CTR1 + vpshufb TMP3, CTR2, CTR2 + vpxor TMP4, CTR2, CTR2 + vpshufb TMP3, CTR3, CTR3 + vpxor TMP4, CTR3, CTR3 + vpshufb TMP3, CTR4, CTR4 + vpxor TMP4, CTR4, CTR4 + vpshufb TMP3, CTR5, CTR5 + vpxor 
TMP4, CTR5, CTR5 + vpshufb TMP3, CTR6, CTR6 + vpxor TMP4, CTR6, CTR6 + vpshufb TMP3, CTR7, CTR7 + vpxor TMP4, CTR7, CTR7 + + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + ROUNDMUL 1 + + ROUNDMUL 2 + + ROUNDMUL 3 + + ROUNDMUL 4 + + ROUNDMUL 5 + + ROUNDMUL 6 + + vpxor 7*16(%rsp), T, TMP5 + vmovdqu 7*16(KS), TMP3 + + vaesenc TMP3, CTR0, CTR0 + vaesenc TMP3, CTR1, CTR1 + vaesenc TMP3, CTR2, CTR2 + vaesenc TMP3, CTR3, CTR3 + + vpshufd $78, TMP5, TMP4 + vpxor TMP5, TMP4, TMP4 + + vaesenc TMP3, CTR4, CTR4 + vaesenc TMP3, CTR5, CTR5 + vaesenc TMP3, CTR6, CTR6 + vaesenc TMP3, CTR7, CTR7 + + vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3 + vpxor TMP3, TMP0, TMP0 + + ROUND 8 + vmovdqa .Lpoly(%rip), TMP5 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + + vpclmulqdq $0x10, TMP5, T, TMP1 + vpalignr $8, T, T, T + vpxor T, TMP1, T + + ROUND 9 + + vpclmulqdq $0x10, TMP5, T, TMP1 + vpalignr $8, T, T, T + vpxor T, TMP1, T + + vmovdqu 160(KS), TMP5 + cmp $10, NR + jbe .LLast2 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp $12, NR + jbe .LLast2 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LLast2: + + vpxor (PT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(PT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(PT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(PT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(PT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(PT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(PT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(PT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu 
.Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (CT) + vpshufb TMP3, CTR0, CTR0 + vmovdqu CTR1, 16(CT) + vpshufb TMP3, CTR1, CTR1 + vmovdqu CTR2, 32(CT) + vpshufb TMP3, CTR2, CTR2 + vmovdqu CTR3, 48(CT) + vpshufb TMP3, CTR3, CTR3 + vmovdqu CTR4, 64(CT) + vpshufb TMP3, CTR4, CTR4 + vmovdqu CTR5, 80(CT) + vpshufb TMP3, CTR5, CTR5 + vmovdqu CTR6, 96(CT) + vpshufb TMP3, CTR6, CTR6 + vmovdqu CTR7,112(CT) + vpshufb TMP3, CTR7, CTR7 + + vpxor TMP4, T, T + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDataOctets + +.LEndOctets: + + vmovdqa CTR7, TMP5 + vmovdqa CTR6, 1*16(%rsp) + vmovdqa CTR5, 2*16(%rsp) + vmovdqa CTR4, 3*16(%rsp) + vmovdqa CTR3, 4*16(%rsp) + vmovdqa CTR2, 5*16(%rsp) + vmovdqa CTR1, 6*16(%rsp) + vmovdqa CTR0, 7*16(%rsp) + + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + KARATSUBA 1 + KARATSUBA 2 + KARATSUBA 3 + KARATSUBA 4 + KARATSUBA 5 + KARATSUBA 6 + + vmovdqu 7*16(%rsp), TMP5 + vpxor T, TMP5, TMP5 + vmovdqu 16*7(Htbl), TMP4 + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+7*16(Htbl), TMP4 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + + vmovdqa .Lpoly(%rip), TMP2 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vpxor TMP4, T, T + +#Here we encrypt any remaining whole block +.LDataSingles: + + cmp $16, len + jb .LDataTail + sub $16, len + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 
16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LLast3 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LLast3 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LLast3: + vaesenclast TMP2, TMP1, TMP1 + + vpxor (PT), TMP1, TMP1 + vmovdqu TMP1, (CT) + addq $16, CT + addq $16, PT + + vpshufb .Lbswap_mask(%rip), TMP1, TMP1 + vpxor TMP1, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + + jmp .LDataSingles + +#Here we encypt the final partial block, if there is one +.LDataTail: + + test len, len + jz DATA_END +# First prepare the counter block + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LLast4 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LLast4 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LLast4: + vaesenclast TMP2, TMP1, TMP1 +#Zero a temp location + vpxor TMP2, TMP2, TMP2 + vmovdqa TMP2, (%rsp) + +# Copy the required bytes only (could probably use rep movsb) + xor KS, KS +.LEncCpy: + cmp KS, len + je .LEncCpyEnd + movb (PT, KS, 1), %r8b + movb %r8b, (%rsp, KS, 1) + inc KS + jmp .LEncCpy +.LEncCpyEnd: +# Xor with the counter block + vpxor (%rsp), TMP1, TMP0 +# Again, store at temp location + vmovdqa TMP0, (%rsp) +# Copy only the required bytes to CT, and 
zero the rest for the hash + xor KS, KS +.LEncCpy2: + cmp KS, len + je .LEncCpy3 + movb (%rsp, KS, 1), %r8b + movb %r8b, (CT, KS, 1) + inc KS + jmp .LEncCpy2 +.LEncCpy3: + cmp $16, KS + je .LEndCpy3 + movb $0, (%rsp, KS, 1) + inc KS + jmp .LEncCpy3 +.LEndCpy3: + vmovdqa (%rsp), TMP0 + + vpshufb .Lbswap_mask(%rip), TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + +DATA_END: + + vpshufb .Lbswap_mask(%rip), T, T + vpshufb .Lbswap_mask(%rip), CTR, CTR + vmovdqu T, 272(Gctx) + vmovdqu CTR, 288(Gctx) + + movq %rbp, %rsp + + popq %rbx + popq %rbp + ret + .size intel_aes_gcmENC, .-intel_aes_gcmENC + +######################### +# Decrypt and Authenticate +# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len); +.type intel_aes_gcmDEC,@function +.globl intel_aes_gcmDEC +.align 16 +intel_aes_gcmDEC: +# parameter 1: CT # input +# parameter 2: PT # output +# parameter 3: %rdx # Gctx +# parameter 4: %rcx # len + +.macro DEC_KARATSUBA i + vmovdqu (7-\i)*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + + vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 +.endm + +.set PT,%rsi +.set CT,%rdi +.set Htbl, %rdx +.set len, %rcx +.set KS,%r9 +.set NR,%r10d + +.set Gctx, %rdx + +.set T,%xmm0 +.set TMP0,%xmm1 +.set TMP1,%xmm2 +.set TMP2,%xmm3 +.set TMP3,%xmm4 +.set TMP4,%xmm5 +.set TMP5,%xmm6 +.set CTR0,%xmm7 +.set CTR1,%xmm8 +.set CTR2,%xmm9 +.set CTR3,%xmm10 +.set CTR4,%xmm11 +.set CTR5,%xmm12 +.set CTR6,%xmm13 +.set CTR7,%xmm14 +.set CTR,%xmm15 + + test len, len + jnz .LbeginDec + ret + +.LbeginDec: + + pushq %rbp + pushq %rbx + movq %rsp, %rbp + sub $128, %rsp + andq $-16, %rsp + vmovdqu 288(Gctx), CTR + vmovdqu 272(Gctx), T + mov 304(Gctx), KS + mov 4(KS), NR + lea 48(KS), KS + + vpshufb .Lbswap_mask(%rip), CTR, CTR + vpshufb 
.Lbswap_mask(%rip), T, T + + vmovdqu .Lbswap_mask(%rip), TMP3 + jmp .LDECOctets + +# Decrypt 8 blocks each time while hashing them at the same time +.align 64 +.LDECOctets: + + cmp $128, len + jb .LDECSingles + sub $128, len + + vmovdqa CTR, CTR0 + vpaddd .Lone(%rip), CTR0, CTR1 + vpaddd .Ltwo(%rip), CTR0, CTR2 + vpaddd .Lone(%rip), CTR2, CTR3 + vpaddd .Ltwo(%rip), CTR2, CTR4 + vpaddd .Lone(%rip), CTR4, CTR5 + vpaddd .Ltwo(%rip), CTR4, CTR6 + vpaddd .Lone(%rip), CTR6, CTR7 + vpaddd .Ltwo(%rip), CTR6, CTR + + vpshufb TMP3, CTR0, CTR0 + vpshufb TMP3, CTR1, CTR1 + vpshufb TMP3, CTR2, CTR2 + vpshufb TMP3, CTR3, CTR3 + vpshufb TMP3, CTR4, CTR4 + vpshufb TMP3, CTR5, CTR5 + vpshufb TMP3, CTR6, CTR6 + vpshufb TMP3, CTR7, CTR7 + + vmovdqu (KS), TMP3 + vpxor TMP3, CTR0, CTR0 + vpxor TMP3, CTR1, CTR1 + vpxor TMP3, CTR2, CTR2 + vpxor TMP3, CTR3, CTR3 + vpxor TMP3, CTR4, CTR4 + vpxor TMP3, CTR5, CTR5 + vpxor TMP3, CTR6, CTR6 + vpxor TMP3, CTR7, CTR7 + + vmovdqu 7*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + vmovdqu 16*0(Htbl), TMP3 + vpclmulqdq $0x11, TMP3, TMP5, TMP1 + vpclmulqdq $0x00, TMP3, TMP5, TMP2 + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+0*16(Htbl), TMP3 + vpclmulqdq $0x00, TMP3, TMP5, TMP0 + + ROUND 1 + DEC_KARATSUBA 1 + + ROUND 2 + DEC_KARATSUBA 2 + + ROUND 3 + DEC_KARATSUBA 3 + + ROUND 4 + DEC_KARATSUBA 4 + + ROUND 5 + DEC_KARATSUBA 5 + + ROUND 6 + DEC_KARATSUBA 6 + + ROUND 7 + + vmovdqu 0*16(CT), TMP5 + vpshufb .Lbswap_mask(%rip), TMP5, TMP5 + vpxor T, TMP5, TMP5 + vmovdqu 16*7(Htbl), TMP4 + + vpclmulqdq $0x11, TMP4, TMP5, TMP3 + vpxor TMP3, TMP1, TMP1 + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP2, TMP2 + + vpshufd $78, TMP5, TMP3 + vpxor TMP5, TMP3, TMP5 + vmovdqu 128+7*16(Htbl), TMP4 + + vpclmulqdq $0x00, TMP4, TMP5, TMP3 + vpxor TMP3, TMP0, TMP0 + + ROUND 8 + + vpxor TMP1, TMP0, TMP0 + vpxor TMP2, TMP0, TMP0 + + vpsrldq $8, TMP0, TMP3 + vpxor TMP3, TMP1, TMP4 + vpslldq $8, TMP0, TMP3 + vpxor TMP3, TMP2, T + vmovdqa 
.Lpoly(%rip), TMP2 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + ROUND 9 + + vpalignr $8, T, T, TMP1 + vpclmulqdq $0x10, TMP2, T, T + vpxor T, TMP1, T + + vmovdqu 160(KS), TMP5 + cmp $10, NR + + jbe .LDECLast1 + + ROUND 10 + ROUND 11 + + vmovdqu 192(KS), TMP5 + cmp $12, NR + + jbe .LDECLast1 + + ROUND 12 + ROUND 13 + + vmovdqu 224(KS), TMP5 + +.LDECLast1: + + vpxor (CT), TMP5, TMP3 + vaesenclast TMP3, CTR0, CTR0 + vpxor 16(CT), TMP5, TMP3 + vaesenclast TMP3, CTR1, CTR1 + vpxor 32(CT), TMP5, TMP3 + vaesenclast TMP3, CTR2, CTR2 + vpxor 48(CT), TMP5, TMP3 + vaesenclast TMP3, CTR3, CTR3 + vpxor 64(CT), TMP5, TMP3 + vaesenclast TMP3, CTR4, CTR4 + vpxor 80(CT), TMP5, TMP3 + vaesenclast TMP3, CTR5, CTR5 + vpxor 96(CT), TMP5, TMP3 + vaesenclast TMP3, CTR6, CTR6 + vpxor 112(CT), TMP5, TMP3 + vaesenclast TMP3, CTR7, CTR7 + + vmovdqu .Lbswap_mask(%rip), TMP3 + + vmovdqu CTR0, (PT) + vmovdqu CTR1, 16(PT) + vmovdqu CTR2, 32(PT) + vmovdqu CTR3, 48(PT) + vmovdqu CTR4, 64(PT) + vmovdqu CTR5, 80(PT) + vmovdqu CTR6, 96(PT) + vmovdqu CTR7,112(PT) + + vpxor TMP4, T, T + + lea 128(CT), CT + lea 128(PT), PT + jmp .LDECOctets + +#Here we decrypt and hash any remaining whole block +.LDECSingles: + + cmp $16, len + jb .LDECTail + sub $16, len + + vmovdqu (CT), TMP1 + vpshufb .Lbswap_mask(%rip), TMP1, TMP1 + vpxor TMP1, T, T + vmovdqu (Htbl), TMP0 + call GFMUL + + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LDECLast2 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LDECLast2 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 
16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 +.LDECLast2: + vaesenclast TMP2, TMP1, TMP1 + + vpxor (CT), TMP1, TMP1 + vmovdqu TMP1, (PT) + addq $16, CT + addq $16, PT + jmp .LDECSingles + +#Here we decrypt the final partial block, if there is one +.LDECTail: + test len, len + jz .LDEC_END + + vpshufb .Lbswap_mask(%rip), CTR, TMP1 + vpaddd .Lone(%rip), CTR, CTR + + vpxor (KS), TMP1, TMP1 + vaesenc 16*1(KS), TMP1, TMP1 + vaesenc 16*2(KS), TMP1, TMP1 + vaesenc 16*3(KS), TMP1, TMP1 + vaesenc 16*4(KS), TMP1, TMP1 + vaesenc 16*5(KS), TMP1, TMP1 + vaesenc 16*6(KS), TMP1, TMP1 + vaesenc 16*7(KS), TMP1, TMP1 + vaesenc 16*8(KS), TMP1, TMP1 + vaesenc 16*9(KS), TMP1, TMP1 + vmovdqu 16*10(KS), TMP2 + cmp $10, NR + je .LDECLast3 + vaesenc 16*10(KS), TMP1, TMP1 + vaesenc 16*11(KS), TMP1, TMP1 + vmovdqu 16*12(KS), TMP2 + cmp $12, NR + je .LDECLast3 + vaesenc 16*12(KS), TMP1, TMP1 + vaesenc 16*13(KS), TMP1, TMP1 + vmovdqu 16*14(KS), TMP2 + +.LDECLast3: + vaesenclast TMP2, TMP1, TMP1 + + vpxor TMP2, TMP2, TMP2 + vmovdqa TMP2, (%rsp) +# Copy the required bytes only (could probably use rep movsb) + xor KS, KS +.LDecCpy: + cmp KS, len + je .LDecCpy2 + movb (CT, KS, 1), %r8b + movb %r8b, (%rsp, KS, 1) + inc KS + jmp .LDecCpy +.LDecCpy2: + cmp $16, KS + je .LDecCpyEnd + movb $0, (%rsp, KS, 1) + inc KS + jmp .LDecCpy2 +.LDecCpyEnd: +# Xor with the counter block + vmovdqa (%rsp), TMP0 + vpxor TMP0, TMP1, TMP1 +# Again, store at temp location + vmovdqa TMP1, (%rsp) +# Copy only the required bytes to PT, and zero the rest for the hash + xor KS, KS +.LDecCpy3: + cmp KS, len + je .LDecCpyEnd3 + movb (%rsp, KS, 1), %r8b + movb %r8b, (PT, KS, 1) + inc KS + jmp .LDecCpy3 +.LDecCpyEnd3: + vpshufb .Lbswap_mask(%rip), TMP0, TMP0 + vpxor TMP0, T, T + vmovdqu (Htbl), TMP0 + call GFMUL +.LDEC_END: + + vpshufb .Lbswap_mask(%rip), T, T + vpshufb .Lbswap_mask(%rip), CTR, CTR + vmovdqu T, 272(Gctx) + vmovdqu CTR, 288(Gctx) + + movq %rbp, %rsp + + popq %rbx + popq %rbp + ret + .size intel_aes_gcmDEC, 
.-intel_aes_gcmDEC +######################### +# a = T +# b = TMP0 - remains unchanged +# res = T +# uses also TMP1,TMP2,TMP3,TMP4 +# __m128i GFMUL(__m128i A, __m128i B); +.type GFMUL,@function +.globl GFMUL +GFMUL: + vpclmulqdq $0x00, TMP0, T, TMP1 + vpclmulqdq $0x11, TMP0, T, TMP4 + + vpshufd $78, T, TMP2 + vpshufd $78, TMP0, TMP3 + vpxor T, TMP2, TMP2 + vpxor TMP0, TMP3, TMP3 + + vpclmulqdq $0x00, TMP3, TMP2, TMP2 + vpxor TMP1, TMP2, TMP2 + vpxor TMP4, TMP2, TMP2 + + vpslldq $8, TMP2, TMP3 + vpsrldq $8, TMP2, TMP2 + + vpxor TMP3, TMP1, TMP1 + vpxor TMP2, TMP4, TMP4 + + vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2 + vpshufd $78, TMP1, TMP3 + vpxor TMP3, TMP2, TMP1 + + vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2 + vpshufd $78, TMP1, TMP3 + vpxor TMP3, TMP2, TMP1 + + vpxor TMP4, TMP1, T + ret +.size GFMUL, .-GFMUL + diff --git a/security/nss/lib/freebl/manifest.mn b/security/nss/lib/freebl/manifest.mn index f2b462aa3..ea29a8eaa 100644 --- a/security/nss/lib/freebl/manifest.mn +++ b/security/nss/lib/freebl/manifest.mn @@ -119,6 +119,7 @@ CSRCS = \ $(ECL_SRCS) \ $(STUBS_SRCS) \ $(LOWHASH_SRCS) \ + $(EXTRA_SRCS) \ $(NULL) ALL_CSRCS := $(CSRCS) diff --git a/security/nss/lib/freebl/rijndael.c b/security/nss/lib/freebl/rijndael.c index 59508b3c3..e43a7346c 100644 --- a/security/nss/lib/freebl/rijndael.c +++ b/security/nss/lib/freebl/rijndael.c @@ -20,8 +20,16 @@ #include "gcm.h" #if USE_HW_AES +#include "intel-gcm.h" #include "intel-aes.h" #include "mpi.h" + +static int has_intel_aes = 0; +static int has_intel_avx = 0; +static int has_intel_clmul = 0; +static PRBool use_hw_aes = PR_FALSE; +static PRBool use_hw_avx = PR_FALSE; +static PRBool use_hw_gcm = PR_FALSE; #endif /* @@ -970,10 +978,6 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, const unsigned char *iv, int mode, unsigned int encrypt, unsigned int blocksize) { -#if USE_HW_AES - static int has_intel_aes; - PRBool use_hw_aes = PR_FALSE; -#endif unsigned int Nk; /* According to 
Rijndael AES Proposal, section 12.1, block and key * lengths between 128 and 256 bits are supported, as long as the @@ -1009,12 +1013,18 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, if (disable_hw_aes == NULL) { freebl_cpuid(1, &eax, &ebx, &ecx, &edx); has_intel_aes = (ecx & (1 << 25)) != 0 ? 1 : -1; + has_intel_clmul = (ecx & (1 << 1)) != 0 ? 1 : -1; + has_intel_avx = (ecx & (1 << 28)) != 0 ? 1 : -1; } else { has_intel_aes = -1; + has_intel_avx = -1; + has_intel_clmul = -1; } } use_hw_aes = (PRBool) (has_intel_aes > 0 && (keysize % 8) == 0 && blocksize == 16); + use_hw_gcm = (PRBool) + (use_hw_aes && has_intel_avx>0 && has_intel_clmul>0); #endif /* Nb = (block size in bits) / 32 */ cx->Nb = blocksize / 4; @@ -1117,11 +1127,22 @@ AES_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, cx->isBlock = PR_FALSE; break; case NSS_AES_GCM: +#if USE_HW_AES + if(use_hw_gcm) { + cx->worker_cx = intel_AES_GCM_CreateContext(cx, cx->worker, iv, blocksize); + cx->worker = (freeblCipherFunc) + (encrypt ? intel_AES_GCM_EncryptUpdate : intel_AES_GCM_DecryptUpdate); + cx->destroy = (freeblDestroyFunc) intel_AES_GCM_DestroyContext; + cx->isBlock = PR_FALSE; + } else +#endif + { cx->worker_cx = GCM_CreateContext(cx, cx->worker, iv, blocksize); cx->worker = (freeblCipherFunc) (encrypt ? GCM_EncryptUpdate : GCM_DecryptUpdate); cx->destroy = (freeblDestroyFunc) GCM_DestroyContext; cx->isBlock = PR_FALSE; + } break; case NSS_AES_CTR: cx->worker_cx = CTR_CreateContext(cx, cx->worker, iv, blocksize); |