author     rrelyea%redhat.com <devnull@localhost>    2013-01-15 02:36:11 +0000
committer  rrelyea%redhat.com <devnull@localhost>    2013-01-15 02:36:11 +0000
commit     094bde5d3a5df11e7f03ecc73a492c07d433e8d6 (patch)
tree       87c605d15e71403d3528c58c993866d0ef02f0dc
parent     9f1f716dcf13a4cb0c65e8fe286693e900ada116 (diff)
download   nss-hg-094bde5d3a5df11e7f03ecc73a492c07d433e8d6.tar.gz
Bug 805604 - Efficient AES-GCM implementation that uses Intel's AES and PCLMULQDQ instructions (AES-NI) and the Advanced Vector Extension (AVX) architecture.
patch by Shay Gueron, review by rrelyea.
-rw-r--r--  security/nss/lib/freebl/Makefile          |   23
-rw-r--r--  security/nss/lib/freebl/intel-gcm-wrap.c  |  235
-rw-r--r--  security/nss/lib/freebl/intel-gcm.h       |   62
-rw-r--r--  security/nss/lib/freebl/intel-gcm.s       | 1335
-rw-r--r--  security/nss/lib/freebl/manifest.mn       |    1
-rw-r--r--  security/nss/lib/freebl/rijndael.c        |   29
6 files changed, 1678 insertions, 7 deletions
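
The new code path is only taken after runtime CPU feature detection: the rijndael.c hunk at the end of this patch tests CPUID leaf 1, ECX bit 25 (AES-NI), bit 1 (PCLMULQDQ) and bit 28 (AVX). As a standalone illustration of those checks, here is a minimal sketch using GCC's <cpuid.h>; the macro and variable names are mine, not the patch's.

#include <cpuid.h>
#include <stdio.h>

/* CPUID leaf 1, ECX feature bits checked by the patch. */
#define ECX_PCLMULQDQ (1u << 1)   /* carry-less multiply (PCLMULQDQ) */
#define ECX_AESNI     (1u << 25)  /* AES-NI */
#define ECX_AVX       (1u << 28)  /* AVX */

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        puts("CPUID leaf 1 not available");
        return 1;
    }
    /* Note: the patch checks only these CPU bits; a stricter check would
     * also confirm OS support for AVX state via OSXSAVE/XGETBV. */
    int aes   = (ecx & ECX_AESNI)     != 0;
    int clmul = (ecx & ECX_PCLMULQDQ) != 0;
    int avx   = (ecx & ECX_AVX)       != 0;
    printf("AES-NI=%d PCLMULQDQ=%d AVX=%d -> use Intel GCM path: %d\n",
           aes, clmul, avx, aes && clmul && avx);
    return 0;
}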
diff --git a/security/nss/lib/freebl/Makefile b/security/nss/lib/freebl/Makefile
index 648adec14..9ad9599da 100644
--- a/security/nss/lib/freebl/Makefile
+++ b/security/nss/lib/freebl/Makefile
@@ -91,7 +91,7 @@ ifdef FREEBL_PRELINK_COMMAND
DEFINES +=-DFREEBL_PRELINK_COMMAND=\"$(FREEBL_PRELINK_COMMAND)\"
endif
# NSS_X86 means the target is a 32-bits x86 CPU architecture
-# NSS_X64 means the target is a 64-bits x64 CPU architecture
+# NSS_X64 means the target is a 64-bits 64 CPU architecture
# NSS_X86_OR_X64 means the target is either x86 or x64
ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH)))
DEFINES += -DNSS_X86_OR_X64
@@ -187,7 +187,9 @@ ifeq ($(CPU_ARCH),x86_64)
# DEFINES += -DMPI_AMD64_ADD
# comment the next two lines to turn off intel HW acceleration
DEFINES += -DUSE_HW_AES
- ASFILES += intel-aes.s
+ ASFILES += intel-aes.s intel-gcm.s
+ EXTRA_SRCS += intel-gcm-wrap.c
+ INTEL_GCM=1
MPI_SRCS += mpi_amd64.c mp_comba.c
endif
ifeq ($(CPU_ARCH),x86)
@@ -442,7 +444,9 @@ else
DEFINES += -DNSS_USE_COMBA -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
# comment the next two lines to turn off intel HW acceleration
DEFINES += -DUSE_HW_AES
- ASFILES += intel-aes.s
+ ASFILES += intel-aes.s intel-gcm.s
+ EXTRA_SRCS += intel-gcm-wrap.c
+ INTEL_GCM=1
MPI_SRCS += mpi_amd64.c
else
# Solaris x86
@@ -643,3 +647,16 @@ else
endif
endif
endif
+
+ifdef INTEL_GCM
+#
+# GCM binary needs -mssse3
+#
+$(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): intel-gcm-wrap.c
+ @$(MAKE_OBJDIR)
+ifdef NEED_ABSOLUTE_PATH
+ $(CC) -o $@ -c -mssse3 $(CFLAGS) $(call core_abspath,$<)
+else
+ $(CC) -o $@ -c -mssse3 $(CFLAGS) $<
+endif
+endif
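
The dedicated rule above exists because intel-gcm-wrap.c uses SSSE3 intrinsics (notably _mm_shuffle_epi8 from tmmintrin.h, used to byte-swap the counter block), so only that one object is compiled with -mssse3 while the rest of freebl keeps its default flags; the runtime CPUID checks in rijndael.c ensure the wrapper only runs on CPUs with AES-NI, PCLMUL and AVX, all of which postdate SSSE3. Below is a small standalone sketch of the counter-increment idiom the wrapper builds from those intrinsics; compile with -mssse3, and note that ctr32_inc is a name of my own, not code from the patch.

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* Increment the big-endian 32-bit counter held in the last four bytes of a
 * 16-byte GCM counter block: byte-reverse the block, add 1 to the low
 * 32-bit lane, byte-reverse back (the same shuffle/add/shuffle sequence
 * the wrapper applies to gcm->CTR). */
static void ctr32_inc(uint8_t ctr[16])
{
    const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    const __m128i one   = _mm_set_epi32(0, 0, 0, 1);
    __m128i c = _mm_loadu_si128((const __m128i *)ctr);
    c = _mm_shuffle_epi8(c, bswap);
    c = _mm_add_epi32(c, one);
    c = _mm_shuffle_epi8(c, bswap);
    _mm_storeu_si128((__m128i *)ctr, c);
}

int main(void)
{
    uint8_t ctr[16] = {0};
    ctr[15] = 0x01;                 /* counter block ends in ...00000001 */
    ctr32_inc(ctr);
    printf("%02x %02x %02x %02x\n", ctr[12], ctr[13], ctr[14], ctr[15]); /* 00 00 00 02 */
    return 0;
}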
diff --git a/security/nss/lib/freebl/intel-gcm-wrap.c b/security/nss/lib/freebl/intel-gcm-wrap.c
new file mode 100644
index 000000000..cda2fac58
--- /dev/null
+++ b/security/nss/lib/freebl/intel-gcm-wrap.c
@@ -0,0 +1,235 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Wrapper functions for the Intel optimized implementation of AES-GCM */
+
+#ifdef USE_HW_AES
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+#include "blapii.h"
+#include "blapit.h"
+#include "gcm.h"
+#include "ctr.h"
+#include "secerr.h"
+#include "prtypes.h"
+#include "pkcs11t.h"
+
+#include <limits.h>
+
+#include "intel-gcm.h"
+#include "rijndael.h"
+
+#if defined(__INTEL_COMPILER)
+#include <ia32intrin.h>
+#elif defined(__GNUC__)
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
+
+
+struct intel_AES_GCMContextStr{
+ unsigned char Htbl[16*AES_BLOCK_SIZE];
+ unsigned char X0[AES_BLOCK_SIZE];
+ unsigned char T[AES_BLOCK_SIZE];
+ unsigned char CTR[AES_BLOCK_SIZE];
+ AESContext *aes_context;
+ unsigned long tagBits;
+ unsigned long Alen;
+ unsigned long Mlen;
+};
+
+intel_AES_GCMContext *intel_AES_GCM_CreateContext(void *context,
+ freeblCipherFunc cipher,
+ const unsigned char *params,
+ unsigned int blocksize)
+{
+ intel_AES_GCMContext *gcm = NULL;
+ AESContext *aes = (AESContext*)context;
+ const CK_GCM_PARAMS *gcmParams = (const CK_GCM_PARAMS *)params;
+ unsigned char buff[AES_BLOCK_SIZE]; /* aux buffer */
+
+ int IV_whole_len = gcmParams->ulIvLen&(~0xf);
+ int IV_remainder_len = gcmParams->ulIvLen&0xf;
+ int AAD_whole_len = gcmParams->ulAADLen&(~0xf);
+ int AAD_remainder_len = gcmParams->ulAADLen&0xf;
+
+ __m128i BSWAP_MASK = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+ __m128i ONE = _mm_set_epi32(0,0,0,1);
+ unsigned int j;
+ SECStatus rv;
+
+ if (blocksize != AES_BLOCK_SIZE) {
+ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+ return NULL;
+ }
+ gcm = PORT_ZNew(intel_AES_GCMContext);
+
+ if (gcm == NULL) {
+ return NULL;
+ }
+ /* initialize context fields */
+ gcm->aes_context = aes;
+ gcm->tagBits = gcmParams->ulTagBits;
+ gcm->Alen = 0;
+ gcm->Mlen = 0;
+ /* first prepare H and its derivatives for ghash */
+ intel_aes_gcmINIT(gcm->Htbl, (unsigned char*)aes->expandedKey, aes->Nr);
+ /* Initial TAG value is zero*/
+ _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)gcm->X0, _mm_setzero_si128());
+ /* Init the counter */
+ if(gcmParams->ulIvLen == 12) {
+ _mm_storeu_si128((__m128i*)gcm->CTR, _mm_setr_epi32(((unsigned int*)gcmParams->pIv)[0], ((unsigned int*)gcmParams->pIv)[1], ((unsigned int*)gcmParams->pIv)[2], 0x01000000));
+ } else {
+ /* If IV size is not 96 bits, then the initial counter value is GHASH of the IV */
+ intel_aes_gcmAAD(gcm->Htbl, gcmParams->pIv, IV_whole_len, gcm->T);
+ /* Partial block */
+ if(IV_remainder_len) {
+ PORT_Memset(buff, 0, AES_BLOCK_SIZE);
+ PORT_Memcpy(buff, gcmParams->pIv + IV_whole_len, IV_remainder_len);
+ intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T);
+ }
+
+ intel_aes_gcmTAG
+ (
+ gcm->Htbl,
+ gcm->T,
+ gcmParams->ulIvLen,
+ 0,
+ gcm->X0,
+ gcm->CTR
+ );
+ /* TAG should be zero again */
+ _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128());
+ }
+    /* Encrypt the initial counter; it will be used to encrypt the GHASH value at the end */
+ rv = (*cipher)(context, gcm->X0, &j, AES_BLOCK_SIZE, gcm->CTR, AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ if (rv != SECSuccess) {
+ goto loser;
+ }
+ /* Promote the counter by 1 */
+ _mm_storeu_si128((__m128i*)gcm->CTR, _mm_shuffle_epi8(_mm_add_epi32(ONE, _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)gcm->CTR), BSWAP_MASK)), BSWAP_MASK));
+
+/* Now hash AAD - it would actually make sense to separate the context creation from the AAD,
+ * because that would allow reusing H, which only changes when the AES key changes,
+ * and not with every packet, unlike the IV and AAD */
+ intel_aes_gcmAAD(gcm->Htbl, gcmParams->pAAD, AAD_whole_len, gcm->T);
+ if(AAD_remainder_len) {
+ PORT_Memset(buff, 0, AES_BLOCK_SIZE);
+ PORT_Memcpy(buff, gcmParams->pAAD + AAD_whole_len, AAD_remainder_len);
+ intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T);
+ }
+ gcm->Alen += gcmParams->ulAADLen;
+ return gcm;
+
+ loser:
+ if (gcm) {
+ PORT_Free(gcm);
+ }
+ return NULL;
+}
+
+void intel_AES_GCM_DestroyContext(intel_AES_GCMContext *gcm, PRBool freeit)
+{
+ if (freeit) {
+ PORT_Free(gcm);
+ }
+}
+
+SECStatus intel_AES_GCM_EncryptUpdate(intel_AES_GCMContext *gcm,
+ unsigned char *outbuf,
+ unsigned int *outlen, unsigned int maxout,
+ const unsigned char *inbuf, unsigned int inlen,
+ unsigned int blocksize)
+{
+ unsigned int tagBytes;
+ unsigned char T[AES_BLOCK_SIZE];
+ int j;
+
+ tagBytes = (gcm->tagBits + (PR_BITS_PER_BYTE-1)) / PR_BITS_PER_BYTE;
+ if (UINT_MAX - inlen < tagBytes) {
+ PORT_SetError(SEC_ERROR_INPUT_LEN);
+ return SECFailure;
+ }
+ if (maxout < inlen + tagBytes) {
+ *outlen = inlen + tagBytes;
+ PORT_SetError(SEC_ERROR_OUTPUT_LEN);
+ return SECFailure;
+ }
+
+ intel_aes_gcmENC(
+ inbuf,
+ outbuf,
+ gcm,
+ inlen);
+
+ gcm->Mlen += inlen;
+
+ intel_aes_gcmTAG(
+ gcm->Htbl,
+ gcm->T,
+ gcm->Mlen,
+ gcm->Alen,
+ gcm->X0,
+ T);
+
+ *outlen = inlen + tagBytes;
+
+ for(j=0; j<tagBytes; j++)
+ {
+ outbuf[inlen+j] = T[j];
+ }
+ return SECSuccess;
+}
+
+SECStatus intel_AES_GCM_DecryptUpdate(intel_AES_GCMContext *gcm,
+ unsigned char *outbuf,
+ unsigned int *outlen, unsigned int maxout,
+ const unsigned char *inbuf, unsigned int inlen,
+ unsigned int blocksize)
+{
+ unsigned int tagBytes;
+ unsigned char T[AES_BLOCK_SIZE];
+ const unsigned char *intag;
+
+ tagBytes = (gcm->tagBits + (PR_BITS_PER_BYTE-1)) / PR_BITS_PER_BYTE;
+
+ /* get the authentication block */
+ if (inlen < tagBytes) {
+ PORT_SetError(SEC_ERROR_INVALID_ARGS);
+ return SECFailure;
+ }
+
+ inlen -= tagBytes;
+ intag = inbuf + inlen;
+
+ intel_aes_gcmDEC(
+ inbuf,
+ outbuf,
+ gcm,
+ inlen);
+
+ gcm->Mlen += inlen;
+ intel_aes_gcmTAG(
+ gcm->Htbl,
+ gcm->T,
+ gcm->Mlen,
+ gcm->Alen,
+ gcm->X0,
+ T);
+
+ if (NSS_SecureMemcmp(T, intag, tagBytes) != 0) {
+        /* force a CKR_ENCRYPTED_DATA_INVALID error in softoken */
+ PORT_SetError(SEC_ERROR_BAD_DATA);
+ return SECFailure;
+ }
+ *outlen = inlen;
+
+ return SECSuccess;
+}
+
+#endif
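
Two details of the wrapper above are easy to miss: the initial counter block J0 is IV || 0x00000001 when the IV is exactly 12 bytes, and otherwise the GHASH of the zero-padded IV and its bit length (which the code obtains by reusing intel_aes_gcmAAD and intel_aes_gcmTAG); and EncryptUpdate/DecryptUpdate are effectively one-shot, since they finalize and append (or verify) the tag on every call. The following plain-C sketch covers only the 96-bit-IV counter setup, as a reference against the SSE version; gcm_j0_96 is a name of my own, not the patch's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Build the GCM pre-counter block J0 for the common 96-bit-IV case:
 * J0 = IV || 0x00000001 (NIST SP 800-38D). For other IV lengths, J0 is
 * the GHASH of the zero-padded IV and its bit length, which is the path
 * the wrapper handles with gcmAAD/gcmTAG. */
static void gcm_j0_96(const uint8_t iv[12], uint8_t j0[16])
{
    memcpy(j0, iv, 12);
    j0[12] = 0x00;
    j0[13] = 0x00;
    j0[14] = 0x00;
    j0[15] = 0x01;
}

int main(void)
{
    uint8_t iv[12], j0[16];
    memset(iv, 0xab, sizeof iv);
    gcm_j0_96(iv, j0);
    for (int i = 0; i < 16; i++)
        printf("%02x", j0[i]);       /* abab...ab00000001 */
    printf("\n");
    return 0;
}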
diff --git a/security/nss/lib/freebl/intel-gcm.h b/security/nss/lib/freebl/intel-gcm.h
new file mode 100644
index 000000000..29bfba8d2
--- /dev/null
+++ b/security/nss/lib/freebl/intel-gcm.h
@@ -0,0 +1,62 @@
+#ifndef INTEL_GCM_H
+#define INTEL_GCM_H 1
+
+#include "blapii.h"
+
+typedef struct intel_AES_GCMContextStr intel_AES_GCMContext;
+
+intel_AES_GCMContext *intel_AES_GCM_CreateContext(void *context, freeblCipherFunc cipher,
+ const unsigned char *params, unsigned int blocksize);
+
+void intel_AES_GCM_DestroyContext(intel_AES_GCMContext *gcm, PRBool freeit);
+
+SECStatus intel_AES_GCM_EncryptUpdate(intel_AES_GCMContext *gcm, unsigned char *outbuf,
+ unsigned int *outlen, unsigned int maxout,
+ const unsigned char *inbuf, unsigned int inlen,
+ unsigned int blocksize);
+
+SECStatus intel_AES_GCM_DecryptUpdate(intel_AES_GCMContext *gcm, unsigned char *outbuf,
+ unsigned int *outlen, unsigned int maxout,
+ const unsigned char *inbuf, unsigned int inlen,
+ unsigned int blocksize);
+
+/* Prototypes of functions in the assembler file for fast AES-GCM, using
+   Intel AES-NI and CLMUL-NI, as described in [1]
+   [1] Shay Gueron, Michael E. Kounavis: Intel® Carry-Less Multiplication
+   Instruction and its Usage for Computing the GCM Mode */
+
+/* Prepares the constants used in the aggregated reduction method */
+void intel_aes_gcmINIT(unsigned char Htbl[16*16],
+ unsigned char *KS,
+ int NR);
+
+/* Produces the final GHASH value */
+void intel_aes_gcmTAG(unsigned char Htbl[16*16],
+ unsigned char *Tp,
+ unsigned long Mlen,
+ unsigned long Alen,
+ unsigned char* X0,
+ unsigned char* TAG);
+
+/* Hashes the Additional Authenticated Data, should be used before enc/dec.
+ Operates on whole blocks only. Partial blocks should be padded externally. */
+void intel_aes_gcmAAD(unsigned char Htbl[16*16],
+ unsigned char *AAD,
+ unsigned long Alen,
+ unsigned char *Tp);
+
+/* Encrypts and hashes the Plaintext.
+   Operates on any length of data; however, a partial block should only be encrypted
+   in the last call, otherwise the result will be incorrect. */
+void intel_aes_gcmENC(const unsigned char* PT,
+ unsigned char* CT,
+ void *Gctx,
+ unsigned long len);
+
+/* Similar to ENC, but decrypts the Ciphertext. */
+void intel_aes_gcmDEC(const unsigned char* CT,
+ unsigned char* PT,
+ void *Gctx,
+ unsigned long len);
+
+#endif
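
As the comments above note, intel_aes_gcmAAD consumes whole 16-byte blocks only, which is why intel-gcm-wrap.c splits the AAD (and a non-96-bit IV) into an aligned prefix plus one zero-padded final block. A sketch of that split in portable C follows; hash_padded and the process_blocks callback are hypothetical stand-ins for the assembler routine, not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GCM_BLOCK 16

/* Feed len bytes to a whole-block-only GHASH routine: pass the 16-byte
 * aligned prefix straight through, then zero-pad the tail into a stack
 * buffer and pass that single block (mirrors the wrapper's use of
 * AAD_whole_len / AAD_remainder_len). */
static void hash_padded(void (*process_blocks)(const uint8_t *, size_t),
                        const uint8_t *data, size_t len)
{
    size_t whole = len & ~(size_t)0xf;   /* bytes in full blocks */
    size_t rem   = len & 0xf;            /* 0..15 leftover bytes */

    if (whole)
        process_blocks(data, whole);
    if (rem) {
        uint8_t buf[GCM_BLOCK];
        memset(buf, 0, sizeof buf);
        memcpy(buf, data + whole, rem);
        process_blocks(buf, GCM_BLOCK);
    }
}

/* Demo callback: just report how much was hashed. */
static void report(const uint8_t *p, size_t n)
{
    (void)p;
    printf("hashing %zu bytes\n", n);
}

int main(void)
{
    uint8_t aad[21] = {0};
    hash_padded(report, aad, sizeof aad);   /* prints 16, then 16 (padded) */
    return 0;
}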
diff --git a/security/nss/lib/freebl/intel-gcm.s b/security/nss/lib/freebl/intel-gcm.s
new file mode 100644
index 000000000..49d8ecd98
--- /dev/null
+++ b/security/nss/lib/freebl/intel-gcm.s
@@ -0,0 +1,1335 @@
+
+
+.align 16
+.Lone:
+.quad 1,0
+.Ltwo:
+.quad 2,0
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lshuff_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.Lpoly:
+.quad 0x1, 0xc200000000000000
+
+
+################################################################################
+# Generates the final GCM tag
+# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
+.type intel_aes_gcmTAG,@function
+.globl intel_aes_gcmTAG
+.align 16
+intel_aes_gcmTAG:
+
+.set Htbl, %rdi
+.set Tp, %rsi
+.set Mlen, %rdx
+.set Alen, %rcx
+.set X0, %r8
+.set TAG, %r9
+
+.set T,%xmm0
+.set TMP0,%xmm1
+
+ vmovdqu (Tp), T
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpxor TMP0, TMP0, TMP0
+ shl $3, Mlen
+ shl $3, Alen
+ vpinsrq $0, Mlen, TMP0, TMP0
+ vpinsrq $1, Alen, TMP0, TMP0
+ vpxor TMP0, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpxor (X0), T, T
+ vmovdqu T, (TAG)
+
+ret
+.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
+################################################################################
+# Generates the H table
+# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
+.type intel_aes_gcmINIT,@function
+.globl intel_aes_gcmINIT
+.align 16
+intel_aes_gcmINIT:
+
+.set Htbl, %rdi
+.set KS, %rsi
+.set NR, %edx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+
+CALCULATE_POWERS_OF_H:
+ vmovdqu 16*0(KS), T
+ vaesenc 16*1(KS), T, T
+ vaesenc 16*2(KS), T, T
+ vaesenc 16*3(KS), T, T
+ vaesenc 16*4(KS), T, T
+ vaesenc 16*5(KS), T, T
+ vaesenc 16*6(KS), T, T
+ vaesenc 16*7(KS), T, T
+ vaesenc 16*8(KS), T, T
+ vaesenc 16*9(KS), T, T
+ vmovdqu 16*10(KS), TMP0
+ cmp $10, NR
+ je .LH0done
+ vaesenc 16*10(KS), T, T
+ vaesenc 16*11(KS), T, T
+ vmovdqu 16*12(KS), TMP0
+ cmp $12, NR
+ je .LH0done
+ vaesenc 16*12(KS), T, T
+ vaesenc 16*13(KS), T, T
+ vmovdqu 16*14(KS), TMP0
+
+.LH0done:
+ vaesenclast TMP0, T, T
+
+ vpshufb .Lbswap_mask(%rip), T, T
+
+ vmovdqu T, TMP0
+ # Calculate H` = GFMUL(H, 2)
+ vpsrld $7 , T , %xmm3
+ vmovdqu .Lshuff_mask(%rip), %xmm4
+ vpshufb %xmm4, %xmm3 , %xmm3
+ movq $0xff00 , %rax
+ vmovq %rax, %xmm4
+ vpshufb %xmm3, %xmm4 , %xmm4
+ vmovdqu .Lpoly(%rip), %xmm5
+ vpand %xmm4, %xmm5, %xmm5
+ vpsrld $31, T, %xmm3
+ vpslld $1, T, %xmm4
+ vpslldq $4, %xmm3, %xmm3
+ vpxor %xmm3, %xmm4, T #xmm1 holds now p(x)<<1
+
+ #adding p(x)<<1 to xmm5
+ vpxor %xmm5, T , T
+ vmovdqu T, TMP0
+ vmovdqu T, (Htbl) # H * 2
+ call GFMUL
+ vmovdqu T, 16(Htbl) # H^2 * 2
+ call GFMUL
+ vmovdqu T, 32(Htbl) # H^3 * 2
+ call GFMUL
+ vmovdqu T, 48(Htbl) # H^4 * 2
+ call GFMUL
+ vmovdqu T, 64(Htbl) # H^5 * 2
+ call GFMUL
+ vmovdqu T, 80(Htbl) # H^6 * 2
+ call GFMUL
+ vmovdqu T, 96(Htbl) # H^7 * 2
+ call GFMUL
+ vmovdqu T, 112(Htbl) # H^8 * 2
+
+ # Precalculations for the reduce 4 step
+ vpshufd $78, (Htbl), %xmm8
+ vpshufd $78, 16(Htbl), %xmm9
+ vpshufd $78, 32(Htbl), %xmm10
+ vpshufd $78, 48(Htbl), %xmm11
+ vpshufd $78, 64(Htbl), %xmm12
+ vpshufd $78, 80(Htbl), %xmm13
+ vpshufd $78, 96(Htbl), %xmm14
+ vpshufd $78, 112(Htbl), %xmm15
+
+ vpxor (Htbl), %xmm8, %xmm8
+ vpxor 16(Htbl), %xmm9, %xmm9
+ vpxor 32(Htbl), %xmm10, %xmm10
+ vpxor 48(Htbl), %xmm11, %xmm11
+ vpxor 64(Htbl), %xmm12, %xmm12
+ vpxor 80(Htbl), %xmm13, %xmm13
+ vpxor 96(Htbl), %xmm14, %xmm14
+ vpxor 112(Htbl), %xmm15, %xmm15
+
+ vmovdqu %xmm8, 128(Htbl)
+ vmovdqu %xmm9, 144(Htbl)
+ vmovdqu %xmm10, 160(Htbl)
+ vmovdqu %xmm11, 176(Htbl)
+ vmovdqu %xmm12, 192(Htbl)
+ vmovdqu %xmm13, 208(Htbl)
+ vmovdqu %xmm14, 224(Htbl)
+ vmovdqu %xmm15, 240(Htbl)
+
+ ret
+.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
+################################################################################
+# Authenticate only
+# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
+
+.globl intel_aes_gcmAAD
+.type intel_aes_gcmAAD,@function
+.align 16
+intel_aes_gcmAAD:
+
+.set DATA, %xmm0
+.set T, %xmm1
+.set BSWAP_MASK, %xmm2
+.set TMP0, %xmm3
+.set TMP1, %xmm4
+.set TMP2, %xmm5
+.set TMP3, %xmm6
+.set TMP4, %xmm7
+.set Xhi, %xmm9
+
+.set Htbl, %rdi
+.set inp, %rsi
+.set len, %rdx
+.set Tp, %rcx
+
+.set hlp0, %r11
+
+.macro KARATSUBA_AAD i
+ vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
+ vpxor TMP3, TMP0, TMP0
+ vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpshufd $78, DATA, TMP3
+ vpxor DATA, TMP3, TMP3
+ vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
+ vpxor TMP3, TMP2, TMP2
+.endm
+
+ test len, len
+ jnz .LbeginAAD
+ ret
+
+.LbeginAAD:
+
+ push hlp0
+ vzeroupper
+
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ vpxor Xhi, Xhi, Xhi
+
+ vmovdqu (Tp),T
+ vpshufb BSWAP_MASK,T,T
+
+    # We hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
+ mov len, hlp0
+ and $~-128, hlp0
+
+ jz .Lmod_loop
+
+ sub hlp0, len
+ sub $16, hlp0
+
+ #hash first prefix block
+ vmovdqu (inp), DATA
+ vpshufb BSWAP_MASK, DATA, DATA
+ vpxor T, DATA, DATA
+
+ vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
+ vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
+ vpshufd $78, DATA, TMP2
+ vpxor DATA, TMP2, TMP2
+ vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
+
+ lea 16(inp), inp
+ test hlp0, hlp0
+ jnz .Lpre_loop
+ jmp .Lred1
+
+    #hash remaining prefix blocks (up to 7 total prefix blocks)
+.align 64
+.Lpre_loop:
+
+ sub $16, hlp0
+
+ vmovdqu (inp),DATA # next data block
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3
+ vpxor TMP3, TMP0, TMP0
+ vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpshufd $78, DATA, TMP3
+ vpxor DATA, TMP3, TMP3
+ vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+ test hlp0, hlp0
+
+ lea 16(inp), inp
+
+ jnz .Lpre_loop
+
+.Lred1:
+ vpxor TMP0, TMP2, TMP2
+ vpxor TMP1, TMP2, TMP2
+ vpsrldq $8, TMP2, TMP3
+ vpslldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, Xhi
+ vpxor TMP2, TMP0, T
+
+.align 64
+.Lmod_loop:
+ sub $0x80, len
+ jb .Ldone
+
+ vmovdqu 16*7(inp),DATA # Ii
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x00, (Htbl), DATA, TMP0
+ vpclmulqdq $0x11, (Htbl), DATA, TMP1
+ vpshufd $78, DATA, TMP2
+ vpxor DATA, TMP2, TMP2
+ vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
+ #########################################################
+ vmovdqu 16*6(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+ KARATSUBA_AAD 1
+ #########################################################
+ vmovdqu 16*5(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a
+ vpalignr $8, T, T, T
+
+ KARATSUBA_AAD 2
+
+ vpxor TMP4, T, T #reduction stage 1b
+ #########################################################
+ vmovdqu 16*4(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 3
+ #########################################################
+ vmovdqu 16*3(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a
+ vpalignr $8, T, T, T
+
+ KARATSUBA_AAD 4
+
+ vpxor TMP4, T, T #reduction stage 2b
+ #########################################################
+ vmovdqu 16*2(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 5
+
+ vpxor Xhi, T, T #reduction finalize
+ #########################################################
+ vmovdqu 16*1(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 6
+ #########################################################
+ vmovdqu 16*0(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+ vpxor T,DATA,DATA
+
+ KARATSUBA_AAD 7
+ #########################################################
+ vpxor TMP0, TMP2, TMP2 # karatsuba fixup
+ vpxor TMP1, TMP2, TMP2
+ vpsrldq $8, TMP2, TMP3
+ vpslldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, Xhi
+ vpxor TMP2, TMP0, T
+
+ lea 16*8(inp), inp
+ jmp .Lmod_loop
+ #########################################################
+
+.Ldone:
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
+ vpalignr $8, T, T, T
+ vpxor TMP3, T, T
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
+ vpalignr $8, T, T, T
+ vpxor TMP3, T, T
+
+ vpxor Xhi, T, T
+
+.Lsave:
+ vpshufb BSWAP_MASK,T, T
+ vmovdqu T,(Tp)
+ vzeroupper
+
+ pop hlp0
+ ret
+.size intel_aes_gcmAAD,.-intel_aes_gcmAAD
+
+################################################################################
+# Encrypt and Authenticate
+# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
+.type intel_aes_gcmENC,@function
+.globl intel_aes_gcmENC
+.align 16
+intel_aes_gcmENC:
+
+.set PT,%rdi
+.set CT,%rsi
+.set Htbl, %rdx
+.set len, %rcx
+.set KS,%r9
+.set NR,%r10d
+
+.set Gctx, %rdx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+.set TMP1,%xmm2
+.set TMP2,%xmm3
+.set TMP3,%xmm4
+.set TMP4,%xmm5
+.set TMP5,%xmm6
+.set CTR0,%xmm7
+.set CTR1,%xmm8
+.set CTR2,%xmm9
+.set CTR3,%xmm10
+.set CTR4,%xmm11
+.set CTR5,%xmm12
+.set CTR6,%xmm13
+.set CTR7,%xmm14
+.set CTR,%xmm15
+
+.macro ROUND i
+ vmovdqu \i*16(KS), TMP3
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+.endm
+
+.macro ROUNDMUL i
+
+ vmovdqu \i*16(%rsp), TMP5
+ vmovdqu \i*16(KS), TMP3
+
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+
+ vpshufd $78, TMP5, TMP4
+ vpxor TMP5, TMP4, TMP4
+
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
+ vpxor TMP3, TMP0, TMP0
+ vmovdqa \i*16(Htbl), TMP4
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+.endm
+
+.macro KARATSUBA i
+ vmovdqu \i*16(%rsp), TMP5
+
+ vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+.endm
+
+ test len, len
+ jnz .Lbegin
+ ret
+
+.Lbegin:
+
+ vzeroupper
+ push %rbp
+ push %rbx
+
+ movq %rsp, %rbp
+ sub $128, %rsp
+ andq $-16, %rsp
+
+ vmovdqu 288(Gctx), CTR
+ vmovdqu 272(Gctx), T
+ mov 304(Gctx), KS
+ mov 4(KS), NR
+ lea 48(KS), KS
+
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vpshufb .Lbswap_mask(%rip), T, T
+
+ cmp $128, len
+ jb .LDataSingles
+
+# Encrypt the first eight blocks
+ sub $128, len
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR
+
+ vpshufb .Lbswap_mask(%rip), CTR0, CTR0
+ vpshufb .Lbswap_mask(%rip), CTR1, CTR1
+ vpshufb .Lbswap_mask(%rip), CTR2, CTR2
+ vpshufb .Lbswap_mask(%rip), CTR3, CTR3
+ vpshufb .Lbswap_mask(%rip), CTR4, CTR4
+ vpshufb .Lbswap_mask(%rip), CTR5, CTR5
+ vpshufb .Lbswap_mask(%rip), CTR6, CTR6
+ vpshufb .Lbswap_mask(%rip), CTR7, CTR7
+
+ vpxor (KS), CTR0, CTR0
+ vpxor (KS), CTR1, CTR1
+ vpxor (KS), CTR2, CTR2
+ vpxor (KS), CTR3, CTR3
+ vpxor (KS), CTR4, CTR4
+ vpxor (KS), CTR5, CTR5
+ vpxor (KS), CTR6, CTR6
+ vpxor (KS), CTR7, CTR7
+
+ ROUND 1
+ ROUND 2
+ ROUND 3
+ ROUND 4
+ ROUND 5
+ ROUND 6
+ ROUND 7
+ ROUND 8
+ ROUND 9
+
+ vmovdqu 160(KS), TMP5
+ cmp $12, NR
+ jb .LLast1
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5
+ cmp $14, NR
+ jb .LLast1
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5
+
+.LLast1:
+
+ vpxor (PT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ vmovdqu CTR0, (CT)
+ vpshufb TMP3, CTR0, CTR0
+ vmovdqu CTR1, 16(CT)
+ vpshufb TMP3, CTR1, CTR1
+ vmovdqu CTR2, 32(CT)
+ vpshufb TMP3, CTR2, CTR2
+ vmovdqu CTR3, 48(CT)
+ vpshufb TMP3, CTR3, CTR3
+ vmovdqu CTR4, 64(CT)
+ vpshufb TMP3, CTR4, CTR4
+ vmovdqu CTR5, 80(CT)
+ vpshufb TMP3, CTR5, CTR5
+ vmovdqu CTR6, 96(CT)
+ vpshufb TMP3, CTR6, CTR6
+ vmovdqu CTR7, 112(CT)
+ vpshufb TMP3, CTR7, CTR7
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDataOctets
+
+# Encrypt 8 blocks each time while hashing previous 8 blocks
+.align 64
+.LDataOctets:
+ cmp $128, len
+ jb .LEndOctets
+ sub $128, len
+
+ vmovdqa CTR7, TMP5
+ vmovdqa CTR6, 1*16(%rsp)
+ vmovdqa CTR5, 2*16(%rsp)
+ vmovdqa CTR4, 3*16(%rsp)
+ vmovdqa CTR3, 4*16(%rsp)
+ vmovdqa CTR2, 5*16(%rsp)
+ vmovdqa CTR1, 6*16(%rsp)
+ vmovdqa CTR0, 7*16(%rsp)
+
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR
+
+ vmovdqu (KS), TMP4
+ vpshufb TMP3, CTR0, CTR0
+ vpxor TMP4, CTR0, CTR0
+ vpshufb TMP3, CTR1, CTR1
+ vpxor TMP4, CTR1, CTR1
+ vpshufb TMP3, CTR2, CTR2
+ vpxor TMP4, CTR2, CTR2
+ vpshufb TMP3, CTR3, CTR3
+ vpxor TMP4, CTR3, CTR3
+ vpshufb TMP3, CTR4, CTR4
+ vpxor TMP4, CTR4, CTR4
+ vpshufb TMP3, CTR5, CTR5
+ vpxor TMP4, CTR5, CTR5
+ vpshufb TMP3, CTR6, CTR6
+ vpxor TMP4, CTR6, CTR6
+ vpshufb TMP3, CTR7, CTR7
+ vpxor TMP4, CTR7, CTR7
+
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ ROUNDMUL 1
+
+ ROUNDMUL 2
+
+ ROUNDMUL 3
+
+ ROUNDMUL 4
+
+ ROUNDMUL 5
+
+ ROUNDMUL 6
+
+ vpxor 7*16(%rsp), T, TMP5
+ vmovdqu 7*16(KS), TMP3
+
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+
+ vpshufd $78, TMP5, TMP4
+ vpxor TMP5, TMP4, TMP4
+
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+
+ vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ ROUND 8
+ vmovdqa .Lpoly(%rip), TMP5
+
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+
+ vpclmulqdq $0x10, TMP5, T, TMP1
+ vpalignr $8, T, T, T
+ vpxor T, TMP1, T
+
+ ROUND 9
+
+ vpclmulqdq $0x10, TMP5, T, TMP1
+ vpalignr $8, T, T, T
+ vpxor T, TMP1, T
+
+ vmovdqu 160(KS), TMP5
+ cmp $10, NR
+ jbe .LLast2
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5
+ cmp $12, NR
+ jbe .LLast2
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5
+
+.LLast2:
+
+ vpxor (PT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ vmovdqu CTR0, (CT)
+ vpshufb TMP3, CTR0, CTR0
+ vmovdqu CTR1, 16(CT)
+ vpshufb TMP3, CTR1, CTR1
+ vmovdqu CTR2, 32(CT)
+ vpshufb TMP3, CTR2, CTR2
+ vmovdqu CTR3, 48(CT)
+ vpshufb TMP3, CTR3, CTR3
+ vmovdqu CTR4, 64(CT)
+ vpshufb TMP3, CTR4, CTR4
+ vmovdqu CTR5, 80(CT)
+ vpshufb TMP3, CTR5, CTR5
+ vmovdqu CTR6, 96(CT)
+ vpshufb TMP3, CTR6, CTR6
+ vmovdqu CTR7,112(CT)
+ vpshufb TMP3, CTR7, CTR7
+
+ vpxor TMP4, T, T
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDataOctets
+
+.LEndOctets:
+
+ vmovdqa CTR7, TMP5
+ vmovdqa CTR6, 1*16(%rsp)
+ vmovdqa CTR5, 2*16(%rsp)
+ vmovdqa CTR4, 3*16(%rsp)
+ vmovdqa CTR3, 4*16(%rsp)
+ vmovdqa CTR2, 5*16(%rsp)
+ vmovdqa CTR1, 6*16(%rsp)
+ vmovdqa CTR0, 7*16(%rsp)
+
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ KARATSUBA 1
+ KARATSUBA 2
+ KARATSUBA 3
+ KARATSUBA 4
+ KARATSUBA 5
+ KARATSUBA 6
+
+ vmovdqu 7*16(%rsp), TMP5
+ vpxor T, TMP5, TMP5
+ vmovdqu 16*7(Htbl), TMP4
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+7*16(Htbl), TMP4
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+
+ vmovdqa .Lpoly(%rip), TMP2
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vpxor TMP4, T, T
+
+#Here we encrypt any remaining whole block
+.LDataSingles:
+
+ cmp $16, len
+ jb .LDataTail
+ sub $16, len
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LLast3
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LLast3
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LLast3:
+ vaesenclast TMP2, TMP1, TMP1
+
+ vpxor (PT), TMP1, TMP1
+ vmovdqu TMP1, (CT)
+ addq $16, CT
+ addq $16, PT
+
+ vpshufb .Lbswap_mask(%rip), TMP1, TMP1
+ vpxor TMP1, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+ jmp .LDataSingles
+
+#Here we encrypt the final partial block, if there is one
+.LDataTail:
+
+ test len, len
+ jz DATA_END
+# First prepare the counter block
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LLast4
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LLast4
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LLast4:
+ vaesenclast TMP2, TMP1, TMP1
+#Zero a temp location
+ vpxor TMP2, TMP2, TMP2
+ vmovdqa TMP2, (%rsp)
+
+# Copy the required bytes only (could probably use rep movsb)
+ xor KS, KS
+.LEncCpy:
+ cmp KS, len
+ je .LEncCpyEnd
+ movb (PT, KS, 1), %r8b
+ movb %r8b, (%rsp, KS, 1)
+ inc KS
+ jmp .LEncCpy
+.LEncCpyEnd:
+# Xor with the counter block
+ vpxor (%rsp), TMP1, TMP0
+# Again, store at temp location
+ vmovdqa TMP0, (%rsp)
+# Copy only the required bytes to CT, and zero the rest for the hash
+ xor KS, KS
+.LEncCpy2:
+ cmp KS, len
+ je .LEncCpy3
+ movb (%rsp, KS, 1), %r8b
+ movb %r8b, (CT, KS, 1)
+ inc KS
+ jmp .LEncCpy2
+.LEncCpy3:
+ cmp $16, KS
+ je .LEndCpy3
+ movb $0, (%rsp, KS, 1)
+ inc KS
+ jmp .LEncCpy3
+.LEndCpy3:
+ vmovdqa (%rsp), TMP0
+
+ vpshufb .Lbswap_mask(%rip), TMP0, TMP0
+ vpxor TMP0, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+DATA_END:
+
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vmovdqu T, 272(Gctx)
+ vmovdqu CTR, 288(Gctx)
+
+ movq %rbp, %rsp
+
+ popq %rbx
+ popq %rbp
+ ret
+ .size intel_aes_gcmENC, .-intel_aes_gcmENC
+
+#########################
+# Decrypt and Authenticate
+# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
+.type intel_aes_gcmDEC,@function
+.globl intel_aes_gcmDEC
+.align 16
+intel_aes_gcmDEC:
+# parameter 1: CT # input
+# parameter 2: PT # output
+# parameter 3: %rdx # Gctx
+# parameter 4: %rcx # len
+
+.macro DEC_KARATSUBA i
+ vmovdqu (7-\i)*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+
+ vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+.endm
+
+.set PT,%rsi
+.set CT,%rdi
+.set Htbl, %rdx
+.set len, %rcx
+.set KS,%r9
+.set NR,%r10d
+
+.set Gctx, %rdx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+.set TMP1,%xmm2
+.set TMP2,%xmm3
+.set TMP3,%xmm4
+.set TMP4,%xmm5
+.set TMP5,%xmm6
+.set CTR0,%xmm7
+.set CTR1,%xmm8
+.set CTR2,%xmm9
+.set CTR3,%xmm10
+.set CTR4,%xmm11
+.set CTR5,%xmm12
+.set CTR6,%xmm13
+.set CTR7,%xmm14
+.set CTR,%xmm15
+
+ test len, len
+ jnz .LbeginDec
+ ret
+
+.LbeginDec:
+
+ pushq %rbp
+ pushq %rbx
+ movq %rsp, %rbp
+ sub $128, %rsp
+ andq $-16, %rsp
+ vmovdqu 288(Gctx), CTR
+ vmovdqu 272(Gctx), T
+ mov 304(Gctx), KS
+ mov 4(KS), NR
+ lea 48(KS), KS
+
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vpshufb .Lbswap_mask(%rip), T, T
+
+ vmovdqu .Lbswap_mask(%rip), TMP3
+ jmp .LDECOctets
+
+# Decrypt 8 blocks each time while hashing them at the same time
+.align 64
+.LDECOctets:
+
+ cmp $128, len
+ jb .LDECSingles
+ sub $128, len
+
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR
+
+ vpshufb TMP3, CTR0, CTR0
+ vpshufb TMP3, CTR1, CTR1
+ vpshufb TMP3, CTR2, CTR2
+ vpshufb TMP3, CTR3, CTR3
+ vpshufb TMP3, CTR4, CTR4
+ vpshufb TMP3, CTR5, CTR5
+ vpshufb TMP3, CTR6, CTR6
+ vpshufb TMP3, CTR7, CTR7
+
+ vmovdqu (KS), TMP3
+ vpxor TMP3, CTR0, CTR0
+ vpxor TMP3, CTR1, CTR1
+ vpxor TMP3, CTR2, CTR2
+ vpxor TMP3, CTR3, CTR3
+ vpxor TMP3, CTR4, CTR4
+ vpxor TMP3, CTR5, CTR5
+ vpxor TMP3, CTR6, CTR6
+ vpxor TMP3, CTR7, CTR7
+
+ vmovdqu 7*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ ROUND 1
+ DEC_KARATSUBA 1
+
+ ROUND 2
+ DEC_KARATSUBA 2
+
+ ROUND 3
+ DEC_KARATSUBA 3
+
+ ROUND 4
+ DEC_KARATSUBA 4
+
+ ROUND 5
+ DEC_KARATSUBA 5
+
+ ROUND 6
+ DEC_KARATSUBA 6
+
+ ROUND 7
+
+ vmovdqu 0*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+ vpxor T, TMP5, TMP5
+ vmovdqu 16*7(Htbl), TMP4
+
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+7*16(Htbl), TMP4
+
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ ROUND 8
+
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+ vmovdqa .Lpoly(%rip), TMP2
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ ROUND 9
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vmovdqu 160(KS), TMP5
+ cmp $10, NR
+
+ jbe .LDECLast1
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5
+ cmp $12, NR
+
+ jbe .LDECLast1
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5
+
+.LDECLast1:
+
+ vpxor (CT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ vmovdqu CTR0, (PT)
+ vmovdqu CTR1, 16(PT)
+ vmovdqu CTR2, 32(PT)
+ vmovdqu CTR3, 48(PT)
+ vmovdqu CTR4, 64(PT)
+ vmovdqu CTR5, 80(PT)
+ vmovdqu CTR6, 96(PT)
+ vmovdqu CTR7,112(PT)
+
+ vpxor TMP4, T, T
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDECOctets
+
+#Here we decrypt and hash any remaining whole block
+.LDECSingles:
+
+ cmp $16, len
+ jb .LDECTail
+ sub $16, len
+
+ vmovdqu (CT), TMP1
+ vpshufb .Lbswap_mask(%rip), TMP1, TMP1
+ vpxor TMP1, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LDECLast2
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LDECLast2
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+.LDECLast2:
+ vaesenclast TMP2, TMP1, TMP1
+
+ vpxor (CT), TMP1, TMP1
+ vmovdqu TMP1, (PT)
+ addq $16, CT
+ addq $16, PT
+ jmp .LDECSingles
+
+#Here we decrypt the final partial block, if there is one
+.LDECTail:
+ test len, len
+ jz .LDEC_END
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LDECLast3
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LDECLast3
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LDECLast3:
+ vaesenclast TMP2, TMP1, TMP1
+
+ vpxor TMP2, TMP2, TMP2
+ vmovdqa TMP2, (%rsp)
+# Copy the required bytes only (could probably use rep movsb)
+ xor KS, KS
+.LDecCpy:
+ cmp KS, len
+ je .LDecCpy2
+ movb (CT, KS, 1), %r8b
+ movb %r8b, (%rsp, KS, 1)
+ inc KS
+ jmp .LDecCpy
+.LDecCpy2:
+ cmp $16, KS
+ je .LDecCpyEnd
+ movb $0, (%rsp, KS, 1)
+ inc KS
+ jmp .LDecCpy2
+.LDecCpyEnd:
+# Xor with the counter block
+ vmovdqa (%rsp), TMP0
+ vpxor TMP0, TMP1, TMP1
+# Again, store at temp location
+ vmovdqa TMP1, (%rsp)
+# Copy only the required bytes to PT, and zero the rest for the hash
+ xor KS, KS
+.LDecCpy3:
+ cmp KS, len
+ je .LDecCpyEnd3
+ movb (%rsp, KS, 1), %r8b
+ movb %r8b, (PT, KS, 1)
+ inc KS
+ jmp .LDecCpy3
+.LDecCpyEnd3:
+ vpshufb .Lbswap_mask(%rip), TMP0, TMP0
+ vpxor TMP0, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+.LDEC_END:
+
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vmovdqu T, 272(Gctx)
+ vmovdqu CTR, 288(Gctx)
+
+ movq %rbp, %rsp
+
+ popq %rbx
+ popq %rbp
+ ret
+ .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
+#########################
+# a = T
+# b = TMP0 - remains unchanged
+# res = T
+# uses also TMP1,TMP2,TMP3,TMP4
+# __m128i GFMUL(__m128i A, __m128i B);
+.type GFMUL,@function
+.globl GFMUL
+GFMUL:
+ vpclmulqdq $0x00, TMP0, T, TMP1
+ vpclmulqdq $0x11, TMP0, T, TMP4
+
+ vpshufd $78, T, TMP2
+ vpshufd $78, TMP0, TMP3
+ vpxor T, TMP2, TMP2
+ vpxor TMP0, TMP3, TMP3
+
+ vpclmulqdq $0x00, TMP3, TMP2, TMP2
+ vpxor TMP1, TMP2, TMP2
+ vpxor TMP4, TMP2, TMP2
+
+ vpslldq $8, TMP2, TMP3
+ vpsrldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, TMP1
+ vpxor TMP2, TMP4, TMP4
+
+ vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
+ vpshufd $78, TMP1, TMP3
+ vpxor TMP3, TMP2, TMP1
+
+ vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
+ vpshufd $78, TMP1, TMP3
+ vpxor TMP3, TMP2, TMP1
+
+ vpxor TMP4, TMP1, T
+ ret
+.size GFMUL, .-GFMUL
+
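
GFMUL above multiplies two elements of GF(2^128) with three PCLMULQDQ operations (Karatsuba) followed by a two-step folding against .Lpoly, and the 8-block loops aggregate several such products against the precomputed powers of H before reducing. For comparison, here is the bit-at-a-time reference multiplication from the GHASH definition in NIST SP 800-38D; it is the same field operation, though it works on blocks in wire byte order rather than the byte-swapped form this file uses, and it is only an illustration, far slower than the code above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Multiply X and Y in GF(2^128) as defined for GHASH (NIST SP 800-38D,
 * Algorithm 1): bit 0 is the most significant bit of byte 0, and the
 * reduction uses the 0xe1 constant for x^128 + x^7 + x^2 + x + 1. */
static void gf128_mul(const uint8_t X[16], const uint8_t Y[16], uint8_t out[16])
{
    uint8_t Z[16] = {0};
    uint8_t V[16];
    memcpy(V, Y, 16);

    for (int i = 0; i < 128; i++) {
        if (X[i / 8] & (0x80u >> (i % 8))) {      /* bit i of X set? */
            for (int j = 0; j < 16; j++)
                Z[j] ^= V[j];
        }
        /* V = V * x: shift the 128-bit string right by one bit, reduce if
         * the bit shifted out was set. */
        int carry = V[15] & 1;
        for (int j = 15; j > 0; j--)
            V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
        V[0] >>= 1;
        if (carry)
            V[0] ^= 0xe1;
    }
    memcpy(out, Z, 16);
}

int main(void)
{
    /* Sanity check: multiplying by the field's identity element (bit 0 set,
     * i.e. 0x80 00...00 in GCM bit order) must return the other operand. */
    uint8_t one[16] = { 0x80 };
    uint8_t h[16], prod[16];
    for (int i = 0; i < 16; i++)
        h[i] = (uint8_t)(i + 1);
    gf128_mul(h, one, prod);
    printf("h * 1 == h ? %s\n", memcmp(h, prod, 16) == 0 ? "yes" : "no");
    return 0;
}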
diff --git a/security/nss/lib/freebl/manifest.mn b/security/nss/lib/freebl/manifest.mn
index f2b462aa3..ea29a8eaa 100644
--- a/security/nss/lib/freebl/manifest.mn
+++ b/security/nss/lib/freebl/manifest.mn
@@ -119,6 +119,7 @@ CSRCS = \
$(ECL_SRCS) \
$(STUBS_SRCS) \
$(LOWHASH_SRCS) \
+ $(EXTRA_SRCS) \
$(NULL)
ALL_CSRCS := $(CSRCS)
diff --git a/security/nss/lib/freebl/rijndael.c b/security/nss/lib/freebl/rijndael.c
index 59508b3c3..e43a7346c 100644
--- a/security/nss/lib/freebl/rijndael.c
+++ b/security/nss/lib/freebl/rijndael.c
@@ -20,8 +20,16 @@
#include "gcm.h"
#if USE_HW_AES
+#include "intel-gcm.h"
#include "intel-aes.h"
#include "mpi.h"
+
+static int has_intel_aes = 0;
+static int has_intel_avx = 0;
+static int has_intel_clmul = 0;
+static PRBool use_hw_aes = PR_FALSE;
+static PRBool use_hw_avx = PR_FALSE;
+static PRBool use_hw_gcm = PR_FALSE;
#endif
/*
@@ -970,10 +978,6 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
const unsigned char *iv, int mode, unsigned int encrypt,
unsigned int blocksize)
{
-#if USE_HW_AES
- static int has_intel_aes;
- PRBool use_hw_aes = PR_FALSE;
-#endif
unsigned int Nk;
/* According to Rijndael AES Proposal, section 12.1, block and key
* lengths between 128 and 256 bits are supported, as long as the
@@ -1009,12 +1013,18 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
if (disable_hw_aes == NULL) {
freebl_cpuid(1, &eax, &ebx, &ecx, &edx);
has_intel_aes = (ecx & (1 << 25)) != 0 ? 1 : -1;
+ has_intel_clmul = (ecx & (1 << 1)) != 0 ? 1 : -1;
+ has_intel_avx = (ecx & (1 << 28)) != 0 ? 1 : -1;
} else {
has_intel_aes = -1;
+ has_intel_avx = -1;
+ has_intel_clmul = -1;
}
}
use_hw_aes = (PRBool)
(has_intel_aes > 0 && (keysize % 8) == 0 && blocksize == 16);
+ use_hw_gcm = (PRBool)
+ (use_hw_aes && has_intel_avx>0 && has_intel_clmul>0);
#endif
/* Nb = (block size in bits) / 32 */
cx->Nb = blocksize / 4;
@@ -1117,11 +1127,22 @@ AES_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
cx->isBlock = PR_FALSE;
break;
case NSS_AES_GCM:
+#if USE_HW_AES
+ if(use_hw_gcm) {
+ cx->worker_cx = intel_AES_GCM_CreateContext(cx, cx->worker, iv, blocksize);
+ cx->worker = (freeblCipherFunc)
+ (encrypt ? intel_AES_GCM_EncryptUpdate : intel_AES_GCM_DecryptUpdate);
+ cx->destroy = (freeblDestroyFunc) intel_AES_GCM_DestroyContext;
+ cx->isBlock = PR_FALSE;
+ } else
+#endif
+ {
cx->worker_cx = GCM_CreateContext(cx, cx->worker, iv, blocksize);
cx->worker = (freeblCipherFunc)
(encrypt ? GCM_EncryptUpdate : GCM_DecryptUpdate);
cx->destroy = (freeblDestroyFunc) GCM_DestroyContext;
cx->isBlock = PR_FALSE;
+ }
break;
case NSS_AES_CTR:
cx->worker_cx = CTR_CreateContext(cx, cx->worker, iv, blocksize);
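
One detail of the rijndael.c changes worth calling out: the detection results are memoized in tri-state statics (0 = not probed yet, 1 = present, -1 = absent), so the CPUID query runs at most once per process, while use_hw_gcm is recomputed cheaply from the cached values for each new context. A compact sketch of that memoization idiom in plain C follows; the names are hypothetical, not NSS's.

#include <stdio.h>

/* Tri-state cache, same convention as has_intel_aes / has_intel_clmul /
 * has_intel_avx in rijndael.c: 0 = not probed yet, 1 = present, -1 = absent. */
static int has_feature = 0;

/* Stand-in for the CPUID probe. */
static int probe_cpu(void)
{
    puts("probing CPU (runs once)");
    return 1;
}

static int feature_available(void)
{
    if (has_feature == 0)
        has_feature = probe_cpu() ? 1 : -1;
    return has_feature > 0;
}

int main(void)
{
    printf("%d %d %d\n", feature_available(), feature_available(),
           feature_available());           /* probe message appears only once */
    return 0;
}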