author     Nikos Mavrogiannopoulos <nmav@redhat.com>    2016-07-01 13:51:18 +0200
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>    2016-07-05 14:50:47 +0200
commit     8b18781de703ed6778ef8fcf7cfaeb208398553d (patch)
tree       2bf23d2ed4d2006b9da5032c9c58298baccedbc5
parent     1c791b698204c115395c556f7c1b60f3931f939a (diff)
download   gnutls-8b18781de703ed6778ef8fcf7cfaeb208398553d.tar.gz
asm: updated openssl and the asm sources for AES-GCM from openssl 1.0.2h
This improves AES-GCM performance significantly by taking advantage of the AVX and MOVBE instructions where available. It uses Andy Polyakov's code, which is distributed under a BSD license.
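The new code path is only worth selecting on CPUs that expose the relevant feature bits; the actual dispatch is part of this commit in lib/accelerated/x86/x86-common.c (not reproduced in full below). As a rough, standalone illustration of the kind of probe involved — not the commit's own logic — the following sketch checks the CPUID leaf-1 bits for PCLMUL, MOVBE and AVX. A complete check would also confirm, via XGETBV, that the OS saves and restores the YMM registers.

#include <cpuid.h>
#include <stdio.h>

/* CPUID leaf 1, ECX feature bits. */
#define ECX_PCLMUL  (1u << 1)   /* PCLMULQDQ, used for GHASH            */
#define ECX_MOVBE   (1u << 22)  /* MOVBE, used for byte-swapped loads   */
#define ECX_OSXSAVE (1u << 27)  /* XSAVE/XGETBV enabled by the OS       */
#define ECX_AVX     (1u << 28)  /* AVX                                  */

static int have_avx_gcm_features(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int wanted = ECX_PCLMUL | ECX_MOVBE | ECX_OSXSAVE | ECX_AVX;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 0;

        /* Sketch only: a full check would additionally verify via XGETBV
         * that the YMM state is enabled by the operating system. */
        return (ecx & wanted) == wanted;
}

int main(void)
{
        printf("AVX AES-GCM prerequisites present: %s\n",
               have_avx_gcm_features() ? "yes" : "no");
        return 0;
}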
-rw-r--r--  cfg.mk                                         |   14
l---------  devel/perlasm/aesni-gcm-x86_64.pl              |    1
l---------  devel/perlasm/aesni-gcm-x86_64.pl.license      |    1
-rw-r--r--  devel/perlasm/license.txt                      |    2
-rw-r--r--  doc/cha-gtls-app.texi                          |    1
-rw-r--r--  lib/accelerated/x86/Makefile.am                |    2
-rw-r--r--  lib/accelerated/x86/aes-gcm-x86-pclmul-avx.c   |  350
-rw-r--r--  lib/accelerated/x86/aes-x86.h                  |    7
-rw-r--r--  lib/accelerated/x86/coff/aes-ssse3-x86_64.s    |   56
-rw-r--r--  lib/accelerated/x86/coff/aesni-gcm-x86_64.s    |  975
-rw-r--r--  lib/accelerated/x86/coff/aesni-x86.s           | 1157
-rw-r--r--  lib/accelerated/x86/coff/aesni-x86_64.s        | 1875
-rw-r--r--  lib/accelerated/x86/coff/ghash-x86_64.s        |  934
-rw-r--r--  lib/accelerated/x86/elf/aes-ssse3-x86.s        |    3
-rw-r--r--  lib/accelerated/x86/elf/aes-ssse3-x86_64.s     |   45
-rw-r--r--  lib/accelerated/x86/elf/aesni-gcm-x86_64.s     |  794
-rw-r--r--  lib/accelerated/x86/elf/aesni-x86.s            | 1162
-rw-r--r--  lib/accelerated/x86/elf/aesni-x86_64.s         | 1796
-rw-r--r--  lib/accelerated/x86/elf/cpuid-x86.s            |    3
-rw-r--r--  lib/accelerated/x86/elf/ghash-x86_64.s         |  839
-rw-r--r--  lib/accelerated/x86/files.mk                   |    6
-rw-r--r--  lib/accelerated/x86/macosx/aes-ssse3-x86_64.s  |   42
-rw-r--r--  lib/accelerated/x86/macosx/aesni-gcm-x86_64.s  |  793
-rw-r--r--  lib/accelerated/x86/macosx/aesni-x86.s         | 1157
-rw-r--r--  lib/accelerated/x86/macosx/aesni-x86_64.s      | 1793
-rw-r--r--  lib/accelerated/x86/macosx/ghash-x86_64.s      |  836
-rw-r--r--  lib/accelerated/x86/x86-common.c               |   77
-rw-r--r--  tests/slow/test-ciphers-common.sh              |   11

28 files changed, 11077 insertions(+), 3655 deletions(-)
diff --git a/cfg.mk b/cfg.mk
index 4947d4dd14..74ad82aae4 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -177,13 +177,12 @@ ASM_SOURCES_XXX := \
lib/accelerated/x86/XXX/ghash-x86_64.s \
lib/accelerated/x86/XXX/aesni-x86_64.s \
lib/accelerated/x86/XXX/aesni-x86.s \
- lib/accelerated/x86/XXX/e_padlock-x86_64.s \
- lib/accelerated/x86/XXX/e_padlock-x86.s \
lib/accelerated/x86/XXX/sha1-ssse3-x86.s \
lib/accelerated/x86/XXX/sha1-ssse3-x86_64.s \
lib/accelerated/x86/XXX/sha256-ssse3-x86.s \
lib/accelerated/x86/XXX/sha512-ssse3-x86.s \
lib/accelerated/x86/XXX/sha512-ssse3-x86_64.s \
+ lib/accelerated/x86/XXX/aesni-gcm-x86_64.s \
lib/accelerated/x86/XXX/aes-ssse3-x86.s \
lib/accelerated/x86/XXX/aes-ssse3-x86_64.s
@@ -200,7 +199,8 @@ X86_FILES=XXX/aesni-x86.s XXX/cpuid-x86.s XXX/sha1-ssse3-x86.s \
XXX/sha256-ssse3-x86.s XXX/sha512-ssse3-x86.s XXX/aes-ssse3-x86.s
X86_64_FILES=XXX/aesni-x86_64.s XXX/cpuid-x86_64.s XXX/ghash-x86_64.s \
- XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s
+ XXX/sha1-ssse3-x86_64.s XXX/sha512-ssse3-x86_64.s XXX/aes-ssse3-x86_64.s \
+ XXX/aesni-gcm-x86_64.s
X86_PADLOCK_FILES=XXX/e_padlock-x86.s
X86_64_PADLOCK_FILES=XXX/e_padlock-x86_64.s
@@ -237,25 +237,25 @@ lib/accelerated/x86/files.mk: $(ASM_SOURCES_ELF)
# Appro's code
lib/accelerated/x86/elf/%.s: devel/perlasm/%.pl .submodule.stamp
cat $<.license > $@
- perl $< elf >> $@
+ CC=gcc perl $< elf >> $@
echo "" >> $@
echo ".section .note.GNU-stack,\"\",%progbits" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/coff/%-x86.s: devel/perlasm/%-x86.pl .submodule.stamp
cat $<.license > $@
- perl $< coff >> $@
+ CC=gcc perl $< coff >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/coff/%-x86_64.s: devel/perlasm/%-x86_64.pl .submodule.stamp
cat $<.license > $@
- perl $< mingw64 >> $@
+ CC=gcc perl $< mingw64 >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
lib/accelerated/x86/macosx/%.s: devel/perlasm/%.pl .submodule.stamp
cat $<.license > $@
- perl $< macosx >> $@
+ CC=gcc perl $< macosx >> $@
echo "" >> $@
sed -i 's/OPENSSL_ia32cap_P/_gnutls_x86_cpuid_s/g' $@
diff --git a/devel/perlasm/aesni-gcm-x86_64.pl b/devel/perlasm/aesni-gcm-x86_64.pl
new file mode 120000
index 0000000000..3ce68f852c
--- /dev/null
+++ b/devel/perlasm/aesni-gcm-x86_64.pl
@@ -0,0 +1 @@
+../openssl/crypto/modes/asm/aesni-gcm-x86_64.pl
\ No newline at end of file
diff --git a/devel/perlasm/aesni-gcm-x86_64.pl.license b/devel/perlasm/aesni-gcm-x86_64.pl.license
new file mode 120000
index 0000000000..cd301a44ab
--- /dev/null
+++ b/devel/perlasm/aesni-gcm-x86_64.pl.license
@@ -0,0 +1 @@
+license.txt
\ No newline at end of file
diff --git a/devel/perlasm/license.txt b/devel/perlasm/license.txt
index 748d2c1aee..c60c82df40 100644
--- a/devel/perlasm/license.txt
+++ b/devel/perlasm/license.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2016, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/doc/cha-gtls-app.texi b/doc/cha-gtls-app.texi
index edb3d32e72..d787d0f44c 100644
--- a/doc/cha-gtls-app.texi
+++ b/doc/cha-gtls-app.texi
@@ -182,6 +182,7 @@ CPU. The currently available options are:
@item 0x2: Enable AES-NI
@item 0x4: Enable SSSE3
@item 0x8: Enable PCLMUL
+@item 0x10: Enable AVX
@item 0x100000: Enable VIA padlock
@item 0x200000: Enable VIA PHE
@item 0x400000: Enable VIA PHE SHA512
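As a usage note for the newly documented 0x10 bit: GNUTLS_CPUID_OVERRIDE is read when the library initializes, so the value must already be in the process environment, with the desired bits OR-ed together. The helper below is illustrative only — the macro names are invented for this sketch; only the numeric bit values come from the documentation above.

#include <stdio.h>

/* Bit values as documented above; macro names are made up for this sketch. */
#define OVR_AESNI   0x2
#define OVR_SSSE3   0x4
#define OVR_PCLMUL  0x8
#define OVR_AVX     0x10

int main(void)
{
        unsigned mask = OVR_AESNI | OVR_SSSE3 | OVR_PCLMUL | OVR_AVX;

        /* Typically exported before the program starts, e.g.
         *   GNUTLS_CPUID_OVERRIDE=0x1e gnutls-cli ...
         */
        printf("GNUTLS_CPUID_OVERRIDE=0x%x\n", mask);  /* prints 0x1e */
        return 0;
}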
diff --git a/lib/accelerated/x86/Makefile.am b/lib/accelerated/x86/Makefile.am
index ae684bd107..c4ee6a1308 100644
--- a/lib/accelerated/x86/Makefile.am
+++ b/lib/accelerated/x86/Makefile.am
@@ -50,7 +50,7 @@ include files.mk
if ASM_X86_64
AM_CFLAGS += -DASM_X86_64 -DASM_X86
-libx86_la_SOURCES += aes-gcm-x86-pclmul.c
+libx86_la_SOURCES += aes-gcm-x86-pclmul.c aes-gcm-x86-pclmul-avx.c
if WINDOWS
libx86_la_SOURCES += $(X86_64_FILES_COFF)
diff --git a/lib/accelerated/x86/aes-gcm-x86-pclmul-avx.c b/lib/accelerated/x86/aes-gcm-x86-pclmul-avx.c
new file mode 100644
index 0000000000..4b469dead9
--- /dev/null
+++ b/lib/accelerated/x86/aes-gcm-x86-pclmul-avx.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (C) 2011-2016 Free Software Foundation, Inc.
+ * Copyright (C) 2015-2016 Red Hat, Inc.
+ *
+ * Author: Nikos Mavrogiannopoulos
+ *
+ * This file is part of GnuTLS.
+ *
+ * The GnuTLS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>
+ *
+ */
+
+/*
+ * The following code is an implementation of the AES-128-GCM cipher
+ * using intel's AES instruction set.
+ */
+
+#include "errors.h"
+#include "gnutls_int.h"
+#include <gnutls/crypto.h>
+#include "errors.h"
+#include <aes-x86.h>
+#include <x86-common.h>
+#include <nettle/memxor.h>
+#include <byteswap.h>
+
+#define GCM_BLOCK_SIZE 16
+
+/* GCM mode with PCLMUL and AVX optimization */
+
+typedef struct {
+ uint64_t hi, lo;
+} u128;
+
+/* This is the gcm128 structure used in openssl. It
+ * is compatible with the included assembly code.
+ */
+struct gcm128_context {
+ union {
+ uint64_t u[2];
+ uint32_t d[4];
+ uint8_t c[16];
+ size_t t[16 / sizeof(size_t)];
+ } Yi, EKi, EK0, len, Xi, H;
+ u128 Htable[16];
+};
+
+struct aes_gcm_ctx {
+ AES_KEY expanded_key;
+ struct gcm128_context gcm;
+};
+
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, size_t len);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+
+static void aes_gcm_deinit(void *_ctx)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ zeroize_temp_key(ctx, sizeof(*ctx));
+ gnutls_free(ctx);
+}
+
+static int
+aes_gcm_cipher_init(gnutls_cipher_algorithm_t algorithm, void **_ctx,
+ int enc)
+{
+ /* we use key size to distinguish */
+ if (algorithm != GNUTLS_CIPHER_AES_128_GCM &&
+ algorithm != GNUTLS_CIPHER_AES_256_GCM)
+ return GNUTLS_E_INVALID_REQUEST;
+
+ *_ctx = gnutls_calloc(1, sizeof(struct aes_gcm_ctx));
+ if (*_ctx == NULL) {
+ gnutls_assert();
+ return GNUTLS_E_MEMORY_ERROR;
+ }
+
+ return 0;
+}
+
+static int
+aes_gcm_cipher_setkey(void *_ctx, const void *userkey, size_t keysize)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int ret;
+
+ ret =
+ aesni_set_encrypt_key(userkey, keysize * 8,
+ ALIGN16(&ctx->expanded_key));
+ if (ret != 0)
+ return gnutls_assert_val(GNUTLS_E_ENCRYPTION_FAILED);
+
+ aesni_ecb_encrypt(ctx->gcm.H.c, ctx->gcm.H.c,
+ GCM_BLOCK_SIZE, ALIGN16(&ctx->expanded_key), 1);
+
+ ctx->gcm.H.u[0] = bswap_64(ctx->gcm.H.u[0]);
+ ctx->gcm.H.u[1] = bswap_64(ctx->gcm.H.u[1]);
+
+ gcm_init_avx(ctx->gcm.Htable, ctx->gcm.H.u);
+
+ return 0;
+}
+
+static int aes_gcm_setiv(void *_ctx, const void *iv, size_t iv_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ if (iv_size != GCM_BLOCK_SIZE - 4)
+ return gnutls_assert_val(GNUTLS_E_INVALID_REQUEST);
+
+ memset(ctx->gcm.Xi.c, 0, sizeof(ctx->gcm.Xi.c));
+ memset(ctx->gcm.len.c, 0, sizeof(ctx->gcm.len.c));
+
+ memcpy(ctx->gcm.Yi.c, iv, GCM_BLOCK_SIZE - 4);
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 4] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 3] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 2] = 0;
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 1] = 1;
+
+ aesni_ecb_encrypt(ctx->gcm.Yi.c, ctx->gcm.EK0.c,
+ GCM_BLOCK_SIZE, ALIGN16(&ctx->expanded_key), 1);
+ ctx->gcm.Yi.c[GCM_BLOCK_SIZE - 1] = 2;
+ return 0;
+}
+
+static void
+gcm_ghash(struct aes_gcm_ctx *ctx, const uint8_t * src, size_t src_size)
+{
+ size_t rest = src_size % GCM_BLOCK_SIZE;
+ size_t aligned_size = src_size - rest;
+
+ if (aligned_size > 0)
+ gcm_ghash_avx(ctx->gcm.Xi.u, ctx->gcm.Htable, src,
+ aligned_size);
+
+ if (rest > 0) {
+ memxor(ctx->gcm.Xi.c, src + aligned_size, rest);
+ gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+ }
+}
+
+static inline void
+ctr_encrypt_last(struct aes_gcm_ctx *ctx, const uint8_t * src,
+ uint8_t * dst, size_t pos, size_t length)
+{
+ uint8_t tmp[GCM_BLOCK_SIZE];
+ uint8_t out[GCM_BLOCK_SIZE];
+
+ memcpy(tmp, &src[pos], length);
+ aesni_ctr32_encrypt_blocks(tmp, out, 1,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ memcpy(&dst[pos], out, length);
+
+}
+
+static int
+aes_gcm_encrypt(void *_ctx, const void *src, size_t src_size,
+ void *dst, size_t length)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int blocks = src_size / GCM_BLOCK_SIZE;
+ int exp_blocks = blocks * GCM_BLOCK_SIZE;
+ int rest = src_size - (exp_blocks);
+ uint32_t counter;
+
+ if (blocks > 0) {
+ aesni_ctr32_encrypt_blocks(src, dst,
+ blocks,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ counter = _gnutls_read_uint32(ctx->gcm.Yi.c + 12);
+ counter += blocks;
+ _gnutls_write_uint32(counter, ctx->gcm.Yi.c + 12);
+ }
+
+ if (rest > 0) /* last incomplete block */
+ ctr_encrypt_last(ctx, src, dst, exp_blocks, rest);
+
+ gcm_ghash(ctx, dst, src_size);
+ ctx->gcm.len.u[1] += src_size;
+
+ return 0;
+}
+
+static int
+aes_gcm_decrypt(void *_ctx, const void *src, size_t src_size,
+ void *dst, size_t dst_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ int blocks = src_size / GCM_BLOCK_SIZE;
+ int exp_blocks = blocks * GCM_BLOCK_SIZE;
+ int rest = src_size - (exp_blocks);
+ uint32_t counter;
+
+ gcm_ghash(ctx, src, src_size);
+ ctx->gcm.len.u[1] += src_size;
+
+ if (blocks > 0) {
+ aesni_ctr32_encrypt_blocks(src, dst,
+ blocks,
+ ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c);
+
+ counter = _gnutls_read_uint32(ctx->gcm.Yi.c + 12);
+ counter += blocks;
+ _gnutls_write_uint32(counter, ctx->gcm.Yi.c + 12);
+ }
+
+ if (rest > 0) /* last incomplete block */
+ ctr_encrypt_last(ctx, src, dst, exp_blocks, rest);
+
+ return 0;
+}
+
+static int aes_gcm_auth(void *_ctx, const void *src, size_t src_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+
+ gcm_ghash(ctx, src, src_size);
+ ctx->gcm.len.u[0] += src_size;
+
+ return 0;
+}
+
+
+static void aes_gcm_tag(void *_ctx, void *tag, size_t tagsize)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ uint8_t buffer[GCM_BLOCK_SIZE];
+ uint64_t alen, clen;
+
+ alen = ctx->gcm.len.u[0] * 8;
+ clen = ctx->gcm.len.u[1] * 8;
+
+ _gnutls_write_uint64(alen, buffer);
+ _gnutls_write_uint64(clen, &buffer[8]);
+
+ gcm_ghash_avx(ctx->gcm.Xi.u, ctx->gcm.Htable, buffer,
+ GCM_BLOCK_SIZE);
+
+ ctx->gcm.Xi.u[0] ^= ctx->gcm.EK0.u[0];
+ ctx->gcm.Xi.u[1] ^= ctx->gcm.EK0.u[1];
+
+ memcpy(tag, ctx->gcm.Xi.c, MIN(GCM_BLOCK_SIZE, tagsize));
+}
+
+#ifdef ASM_X86_64
+/* requires AVX */
+static int
+aesni_gcm_aead_encrypt(void *_ctx,
+ const void *nonce, size_t nonce_size,
+ const void *auth, size_t auth_size,
+ size_t tag_size,
+ const void *plain, size_t plain_size,
+ void *encr, size_t encr_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ size_t s = 0;
+
+ if (encr_size < plain_size + tag_size)
+ return gnutls_assert_val(GNUTLS_E_SHORT_MEMORY_BUFFER);
+
+ aes_gcm_setiv(ctx, nonce, nonce_size);
+ aes_gcm_auth(ctx, auth, auth_size);
+
+ if (plain_size >= 96) {
+ s = aesni_gcm_encrypt(plain, encr, plain_size, ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+ ctx->gcm.len.u[1] += s;
+ }
+
+ if ((plain_size-s) > 0)
+ aes_gcm_encrypt(ctx, ((uint8_t*)plain)+s, plain_size-s, ((uint8_t*)encr)+s, encr_size-s);
+
+ aes_gcm_tag(ctx, ((uint8_t*)encr) + plain_size, tag_size);
+
+ return 0;
+}
+
+static int
+aesni_gcm_aead_decrypt(void *_ctx,
+ const void *nonce, size_t nonce_size,
+ const void *auth, size_t auth_size,
+ size_t tag_size,
+ const void *encr, size_t encr_size,
+ void *plain, size_t plain_size)
+{
+ struct aes_gcm_ctx *ctx = _ctx;
+ uint8_t tag[MAX_HASH_SIZE];
+ size_t s = 0;
+
+ if (encr_size < tag_size)
+ return gnutls_assert_val(GNUTLS_E_DECRYPTION_FAILED);
+
+ aes_gcm_setiv(ctx, nonce, nonce_size);
+ aes_gcm_auth(ctx, auth, auth_size);
+
+ encr_size -= tag_size;
+
+ if (encr_size >= 96) {
+ s = aesni_gcm_decrypt(encr, plain, encr_size, ALIGN16(&ctx->expanded_key),
+ ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+ ctx->gcm.len.u[1] += s;
+ }
+
+ if ((encr_size-s) > 0) {
+ aes_gcm_decrypt(ctx, ((uint8_t*)encr)+s, encr_size-s, ((uint8_t*)plain)+s, plain_size-s);
+ }
+
+ aes_gcm_tag(ctx, tag, tag_size);
+
+ if (gnutls_memcmp(((uint8_t*)encr)+encr_size, tag, tag_size) != 0)
+ return gnutls_assert_val(GNUTLS_E_DECRYPTION_FAILED);
+
+ return 0;
+}
+#else
+# define aesni_gcm_aead_decrypt aes_gcm_aead_decrypt
+# define aesni_gcm_aead_encrypt aes_gcm_aead_encrypt
+# include "aes-gcm-aead.h"
+#endif
+
+const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul_avx = {
+ .init = aes_gcm_cipher_init,
+ .setkey = aes_gcm_cipher_setkey,
+ .setiv = aes_gcm_setiv,
+ .aead_encrypt = aesni_gcm_aead_encrypt,
+ .aead_decrypt = aesni_gcm_aead_decrypt,
+ .encrypt = aes_gcm_encrypt,
+ .decrypt = aes_gcm_decrypt,
+ .deinit = aes_gcm_deinit,
+ .tag = aes_gcm_tag,
+ .auth = aes_gcm_auth,
+};
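The _gnutls_aes_gcm_pclmul_avx vtable above is registered by the CPU-detection code in x86-common.c (also touched by this commit, further down); applications do not call it directly but reach it through the public AEAD API. Below is a minimal sketch, with error handling trimmed, of exercising AES-128-GCM through that API — on a CPU advertising AES-NI, PCLMUL and AVX the calls are expected to be served by the routines defined in this file.

#include <gnutls/gnutls.h>
#include <gnutls/crypto.h>

int main(void)
{
        gnutls_aead_cipher_hd_t h;
        unsigned char key_data[16] = { 0 };      /* demo key, all zeros    */
        unsigned char nonce[12] = { 0 };         /* 96-bit GCM nonce       */
        unsigned char pt[128] = "hello";
        unsigned char ct[sizeof(pt) + 16];       /* ciphertext + 16-byte tag */
        size_t ct_len = sizeof(ct);
        gnutls_datum_t key = { key_data, sizeof(key_data) };

        gnutls_global_init();

        /* Backend selection happens at library init based on CPU features. */
        if (gnutls_aead_cipher_init(&h, GNUTLS_CIPHER_AES_128_GCM, &key) < 0)
                return 1;

        if (gnutls_aead_cipher_encrypt(h, nonce, sizeof(nonce),
                                       NULL, 0,   /* no additional data */
                                       16,        /* tag size           */
                                       pt, sizeof(pt),
                                       ct, &ct_len) < 0)
                return 1;

        gnutls_aead_cipher_deinit(h);
        gnutls_global_deinit();
        return 0;
}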
diff --git a/lib/accelerated/x86/aes-x86.h b/lib/accelerated/x86/aes-x86.h
index e25991bb8c..703521856b 100644
--- a/lib/accelerated/x86/aes-x86.h
+++ b/lib/accelerated/x86/aes-x86.h
@@ -35,6 +35,12 @@ void aesni_ctr32_encrypt_blocks(const unsigned char *in,
const void *key,
const unsigned char *ivec);
+size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
+ const AES_KEY *key, const unsigned char iv[16], uint64_t* Xi);
+
+size_t aesni_gcm_decrypt(const void *inp, void *out, size_t len,
+ const AES_KEY *key, const unsigned char iv[16], uint64_t* Xi);
+
int vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
int vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
@@ -43,6 +49,7 @@ void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *k
void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul;
+extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_pclmul_avx;
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_x86_aesni;
extern const gnutls_crypto_cipher_st _gnutls_aes_ccm_x86_aesni;
extern const gnutls_crypto_cipher_st _gnutls_aes_gcm_x86_ssse3;
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
index 779f5a486f..7d5defdbb1 100644
--- a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
@@ -72,7 +72,7 @@ _vpaes_encrypt_core:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -132,10 +132,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -231,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -254,7 +254,7 @@ _vpaes_schedule_core:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
@@ -277,7 +277,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
@@ -298,7 +298,7 @@ _vpaes_schedule_core:
.p2align 4
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -307,13 +307,13 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
@@ -330,21 +330,21 @@ _vpaes_schedule_core:
.p2align 4
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -379,7 +379,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -411,8 +411,8 @@ _vpaes_schedule_core:
.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -449,7 +449,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -608,7 +608,7 @@ _vpaes_schedule_mangle:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
@@ -646,7 +646,7 @@ vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
movaps 16(%rsp),%xmm6
movaps 32(%rsp),%xmm7
@@ -1007,7 +1007,7 @@ _vpaes_consts:
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
@@ -1046,7 +1046,7 @@ se_handler:
leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 184(%rax),%rax
.Lin_prologue:
@@ -1059,7 +1059,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1116,21 +1116,21 @@ se_handler:
.LSEH_info_vpaes_set_encrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_key_body,.Lenc_key_epilogue
+.rva .Lenc_key_body,.Lenc_key_epilogue
.LSEH_info_vpaes_set_decrypt_key:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_key_body,.Ldec_key_epilogue
+.rva .Ldec_key_body,.Ldec_key_epilogue
.LSEH_info_vpaes_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lenc_body,.Lenc_epilogue
+.rva .Lenc_body,.Lenc_epilogue
.LSEH_info_vpaes_decrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Ldec_body,.Ldec_epilogue
+.rva .Ldec_body,.Ldec_epilogue
.LSEH_info_vpaes_cbc_encrypt:
.byte 9,0,0,0
.rva se_handler
-.rva .Lcbc_body,.Lcbc_epilogue
+.rva .Lcbc_body,.Lcbc_epilogue
diff --git a/lib/accelerated/x86/coff/aesni-gcm-x86_64.s b/lib/accelerated/x86/coff/aesni-gcm-x86_64.s
new file mode 100644
index 0000000000..bc3554ca07
--- /dev/null
+++ b/lib/accelerated/x86/coff/aesni-gcm-x86_64.s
@@ -0,0 +1,975 @@
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+.def _aesni_ctr32_ghash_6x; .scl 3; .type 32; .endef
+.p2align 5
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.p2align 5
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.p2align 5
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.p2align 5
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+
+.globl aesni_gcm_decrypt
+.def aesni_gcm_decrypt; .scl 2; .type 32; .endef
+.p2align 5
+aesni_gcm_decrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_aesni_gcm_decrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -168(%rsp),%rsp
+ movaps %xmm6,-216(%rax)
+ movaps %xmm7,-200(%rax)
+ movaps %xmm8,-184(%rax)
+ movaps %xmm9,-168(%rax)
+ movaps %xmm10,-152(%rax)
+ movaps %xmm11,-136(%rax)
+ movaps %xmm12,-120(%rax)
+ movaps %xmm13,-104(%rax)
+ movaps %xmm14,-88(%rax)
+ movaps %xmm15,-72(%rax)
+.Lgcm_dec_body:
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movaps -216(%rax),%xmm6
+ movaps -200(%rax),%xmm7
+ movaps -184(%rax),%xmm8
+ movaps -168(%rax),%xmm9
+ movaps -152(%rax),%xmm10
+ movaps -136(%rax),%xmm11
+ movaps -120(%rax),%xmm12
+ movaps -104(%rax),%xmm13
+ movaps -88(%rax),%xmm14
+ movaps -72(%rax),%xmm15
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_aesni_gcm_decrypt:
+.def _aesni_ctr32_6x; .scl 3; .type 32; .endef
+.p2align 5
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.p2align 4
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.p2align 5
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+
+.globl aesni_gcm_encrypt
+.def aesni_gcm_encrypt; .scl 2; .type 32; .endef
+.p2align 5
+aesni_gcm_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_aesni_gcm_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -168(%rsp),%rsp
+ movaps %xmm6,-216(%rax)
+ movaps %xmm7,-200(%rax)
+ movaps %xmm8,-184(%rax)
+ movaps %xmm9,-168(%rax)
+ movaps %xmm10,-152(%rax)
+ movaps %xmm11,-136(%rax)
+ movaps %xmm12,-120(%rax)
+ movaps %xmm13,-104(%rax)
+ movaps %xmm14,-88(%rax)
+ movaps %xmm15,-72(%rax)
+.Lgcm_enc_body:
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movaps -216(%rax),%xmm6
+ movaps -200(%rax),%xmm7
+ movaps -184(%rax),%xmm8
+ movaps -168(%rax),%xmm9
+ movaps -152(%rax),%xmm10
+ movaps -136(%rax),%xmm11
+ movaps -120(%rax),%xmm12
+ movaps -104(%rax),%xmm13
+ movaps -88(%rax),%xmm14
+ movaps -72(%rax),%xmm15
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_aesni_gcm_encrypt:
+.p2align 6
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+
+.def gcm_se_handler; .scl 3; .type 32; .endef
+.p2align 4
+gcm_se_handler:
+ pushq %rsi
+ pushq %rdi
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+ subq $64,%rsp
+
+ movq 120(%r8),%rax
+ movq 248(%r8),%rbx
+
+ movq 8(%r9),%rsi
+ movq 56(%r9),%r11
+
+ movl 0(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
+ cmpq %r10,%rbx
+ jb .Lcommon_seh_tail
+
+ movq 152(%r8),%rax
+
+ movl 4(%r11),%r10d
+ leaq (%rsi,%r10,1),%r10
+ cmpq %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ movq 120(%r8),%rax
+
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ movq %r15,240(%r8)
+ movq %r14,232(%r8)
+ movq %r13,224(%r8)
+ movq %r12,216(%r8)
+ movq %rbp,160(%r8)
+ movq %rbx,144(%r8)
+
+ leaq -216(%rax),%rsi
+ leaq 512(%r8),%rdi
+ movl $20,%ecx
+.long 0xa548f3fc
+
+.Lcommon_seh_tail:
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rsi
+ movq %rax,152(%r8)
+ movq %rsi,168(%r8)
+ movq %rdi,176(%r8)
+
+ movq 40(%r9),%rdi
+ movq %r8,%rsi
+ movl $154,%ecx
+.long 0xa548f3fc
+
+ movq %r9,%rsi
+ xorq %rcx,%rcx
+ movq 8(%rsi),%rdx
+ movq 0(%rsi),%r8
+ movq 16(%rsi),%r9
+ movq 40(%rsi),%r10
+ leaq 56(%rsi),%r11
+ leaq 24(%rsi),%r12
+ movq %r10,32(%rsp)
+ movq %r11,40(%rsp)
+ movq %r12,48(%rsp)
+ movq %rcx,56(%rsp)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ movl $1,%eax
+ addq $64,%rsp
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %rdi
+ popq %rsi
+ .byte 0xf3,0xc3
+
+
+.section .pdata
+.p2align 2
+.rva .LSEH_begin_aesni_gcm_decrypt
+.rva .LSEH_end_aesni_gcm_decrypt
+.rva .LSEH_gcm_dec_info
+
+.rva .LSEH_begin_aesni_gcm_encrypt
+.rva .LSEH_end_aesni_gcm_encrypt
+.rva .LSEH_gcm_enc_info
+.section .xdata
+.p2align 3
+.LSEH_gcm_dec_info:
+.byte 9,0,0,0
+.rva gcm_se_handler
+.rva .Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+.byte 9,0,0,0
+.rva gcm_se_handler
+.rva .Lgcm_enc_body,.Lgcm_enc_abort
+
diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s
index 502be77883..2c535e0917 100644
--- a/lib/accelerated/x86/coff/aesni-x86.s
+++ b/lib/accelerated/x86/coff/aesni-x86.s
@@ -60,7 +60,10 @@ _aesni_encrypt:
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.def _aesni_decrypt; .scl 2; .type 32; .endef
@@ -83,31 +86,87 @@ _aesni_decrypt:
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.def __aesni_encrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.def __aesni_decrypt2; .scl 3; .type 32; .endef
+.align 16
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
ret
.def __aesni_encrypt3; .scl 3; .type 32; .endef
.align 16
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -119,25 +178,26 @@ __aesni_encrypt3:
.align 16
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -150,27 +210,29 @@ __aesni_decrypt3:
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -185,27 +247,29 @@ __aesni_encrypt4:
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -219,45 +283,42 @@ __aesni_decrypt4:
.align 16
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp .L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L006enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -275,45 +336,42 @@ __aesni_encrypt6:
.align 16
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp .L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L007dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -342,14 +400,14 @@ _aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -358,9 +416,9 @@ _aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -375,12 +433,12 @@ _aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -389,18 +447,18 @@ _aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -409,50 +467,49 @@ _aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+.L018ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -461,9 +518,9 @@ _aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -478,12 +535,12 @@ _aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -492,18 +549,18 @@ _aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -512,44 +569,51 @@ _aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L026ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -587,48 +651,56 @@ _aesni_ccm64_encrypt_blocks:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -675,71 +747,82 @@ _aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L033ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L033ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -764,7 +847,7 @@ _aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -780,63 +863,59 @@ _aesni_ctr32_encrypt_blocks:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -847,51 +926,51 @@ _aesni_ctr32_encrypt_blocks:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -909,39 +988,39 @@ _aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L038ctr32_two:
- call __aesni_encrypt3
+.L042ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L039ctr32_three:
+.L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -952,9 +1031,9 @@ _aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_four:
+.L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -968,7 +1047,18 @@ _aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -992,12 +1082,12 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1021,12 +1111,14 @@ _aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L048xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1062,6 +1154,7 @@ _aesni_xts_encrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1077,19 +1170,17 @@ _aesni_xts_encrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1114,26 +1205,25 @@ _aesni_xts_encrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1142,7 +1232,7 @@ _aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1152,7 +1242,7 @@ _aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1184,9 +1274,9 @@ _aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1194,37 +1284,36 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1242,9 +1331,9 @@ _aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1266,28 +1355,28 @@ _aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1295,7 +1384,7 @@ _aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1305,16 +1394,30 @@ _aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1338,12 +1441,12 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1372,12 +1475,14 @@ _aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L061xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1413,6 +1518,7 @@ _aesni_xts_decrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1428,19 +1534,17 @@ _aesni_xts_decrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1465,26 +1569,25 @@ _aesni_xts_decrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1493,7 +1596,7 @@ _aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1503,7 +1606,7 @@ _aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1535,9 +1638,9 @@ _aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1545,36 +1648,36 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1592,9 +1695,9 @@ _aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1616,20 +1719,20 @@ _aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1639,7 +1742,7 @@ _aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1653,16 +1756,16 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1670,7 +1773,7 @@ _aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1680,16 +1783,30 @@ _aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1714,7 +1831,7 @@ _aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1722,14 +1839,14 @@ _aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1737,24 +1854,25 @@ _aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1765,20 +1883,20 @@ _aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1808,28 +1926,28 @@ _aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1847,56 +1965,62 @@ _aesni_cbc_encrypt:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+.L085cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1906,28 +2030,44 @@ _aesni_cbc_encrypt:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L079cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L070cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1936,52 +2076,62 @@ _aesni_cbc_encrypt:
.def __aesni_set_encrypt_key; .scl 3; .type 32; .endef
.align 16
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal __gnutls_x86_cpuid_s,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08714rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09010rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L092key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1990,38 +2140,91 @@ __aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L095key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2035,56 +2238,90 @@ __aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08714rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L099key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2093,7 +2330,7 @@ __aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2103,13 +2340,70 @@ __aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L086bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L089bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.def _aesni_set_encrypt_key; .scl 2; .type 32; .endef
@@ -2133,7 +2427,7 @@ _aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2141,7 +2435,7 @@ _aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2151,15 +2445,24 @@ _aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
ret
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.comm __gnutls_x86_cpuid_s,16
diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s
index fc838c18e5..79ffbf70c7 100644
--- a/lib/accelerated/x86/coff/aesni-x86_64.s
+++ b/lib/accelerated/x86/coff/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.def aesni_encrypt; .scl 2; .type 32; .endef
.p2align 4
@@ -53,9 +54,12 @@ aesni_encrypt:
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rdx)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -74,34 +78,96 @@ aesni_decrypt:
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rdx)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+
+.def _aesni_encrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+.def _aesni_decrypt2; .scl 3; .type 32; .endef
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
.byte 0xf3,0xc3
.def _aesni_encrypt3; .scl 3; .type 32; .endef
.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
@@ -116,25 +182,26 @@ _aesni_encrypt3:
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
@@ -149,28 +216,30 @@ _aesni_decrypt3:
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
@@ -187,28 +256,30 @@ _aesni_encrypt4:
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
@@ -225,43 +296,40 @@ _aesni_decrypt4:
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Lenc_loop6_enter
.p2align 4
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
@@ -282,43 +350,40 @@ _aesni_encrypt6:
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Ldec_loop6_enter
.p2align 4
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,46 @@ _aesni_decrypt6:
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.p2align 4
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
@@ -409,52 +468,46 @@ _aesni_encrypt8:
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.p2align 4
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
@@ -489,6 +542,12 @@ aesni_ecb_encrypt:
movq %r9,%rcx
movq 40(%rsp),%r8
+ leaq -88(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,16(%rsp)
+ movaps %xmm8,32(%rsp)
+ movaps %xmm9,48(%rsp)
+.Lecb_enc_body:
andq $-16,%rdx
jz .Lecb_ret
@@ -499,7 +558,7 @@ aesni_ecb_encrypt:
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
@@ -511,7 +570,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.p2align 4
.Lecb_enc_loop8:
@@ -539,7 +598,7 @@ aesni_ecb_encrypt:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
@@ -553,26 +612,27 @@ aesni_ecb_encrypt:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -593,14 +653,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.p2align 4
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -642,7 +701,7 @@ aesni_ecb_encrypt:
.p2align 4
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
@@ -654,7 +713,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.p2align 4
.Lecb_dec_loop8:
@@ -683,49 +742,66 @@ aesni_ecb_encrypt:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.p2align 4
.Lecb_dec_one:
@@ -738,53 +814,86 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.p2align 4
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.p2align 4
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.p2align 4
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.p2align 4
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.p2align 4
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
+ movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
+ movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
+ movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
+ leaq 88(%rsp),%rsp
+.Lecb_enc_ret:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
@@ -811,60 +920,70 @@ aesni_ccm64_encrypt_blocks:
movaps %xmm9,48(%rsp)
.Lccm64_enc_body:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.p2align 4
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
leaq 88(%rsp),%rsp
.Lccm64_enc_ret:
movq 8(%rsp),%rdi
@@ -893,15 +1012,15 @@ aesni_ccm64_decrypt_blocks:
movaps %xmm9,48(%rsp)
.Lccm64_dec_body:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -911,17 +1030,21 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -930,36 +1053,36 @@ aesni_ccm64_decrypt_blocks:
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.p2align 4
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.p2align 4
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -970,13 +1093,23 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp)
movaps 16(%rsp),%xmm7
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm8
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm9
+ movaps %xmm0,48(%rsp)
leaq 88(%rsp),%rsp
.Lccm64_dec_ret:
movq 8(%rsp),%rdi
@@ -997,6 +1130,35 @@ aesni_ctr32_encrypt_blocks:
movq %r9,%rcx
movq 40(%rsp),%r8
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.p2align 4
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $288,%rsp
@@ -1014,8 +1176,8 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_body:
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -1030,32 +1192,33 @@ aesni_ctr32_encrypt_blocks:
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
@@ -1063,7 +1226,9 @@ aesni_ctr32_encrypt_blocks:
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
@@ -1074,10 +1239,104 @@ aesni_ctr32_encrypt_blocks:
cmpq $8,%rdx
jb .Lctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp .Lctr32_loop8
+.p2align 4
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp .Lctr32_loop6
+
+.p2align 4
+.Lctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call .Lenc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
.p2align 5
.Lctr32_loop8:
addl $8,%r8d
@@ -1090,6 +1349,7 @@ aesni_ctr32_encrypt_blocks:
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
@@ -1098,11 +1358,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
@@ -1111,11 +1372,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
@@ -1124,11 +1386,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
@@ -1137,11 +1400,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
@@ -1150,11 +1414,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
@@ -1163,11 +1428,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
@@ -1176,21 +1442,21 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb .Lctr32_enc_done
.byte 102,15,56,220,209
@@ -1233,7 +1499,9 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+.p2align 4
.Lctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
@@ -1245,8 +1513,8 @@ aesni_ctr32_encrypt_blocks:
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1255,26 +1523,26 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
@@ -1300,29 +1568,32 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call .Lenc_loop8_enter
@@ -1355,19 +1626,19 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
@@ -1384,10 +1655,10 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
@@ -1407,40 +1678,43 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-.p2align 4
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
+ movaps %xmm0,112(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@@ -1465,7 +1739,7 @@ aesni_xts_encrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $256,%rsp
+ subq $272,%rsp
andq $-16,%rsp
movaps %xmm6,-168(%rax)
movaps %xmm7,-152(%rax)
@@ -1479,242 +1753,282 @@ aesni_xts_encrypt:
movaps %xmm15,-24(%rax)
.Lxts_enc_body:
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.p2align 4
+.p2align 5
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.p2align 5
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1724,6 +2038,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1755,7 +2070,7 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1771,7 +2086,7 @@ aesni_xts_encrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1817,15 +2132,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1860,22 +2175,45 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@@ -1900,7 +2238,7 @@ aesni_xts_decrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $256,%rsp
+ subq $272,%rsp
andq $-16,%rsp
movaps %xmm6,-168(%rax)
movaps %xmm7,-152(%rax)
@@ -1914,248 +2252,288 @@ aesni_xts_decrypt:
movaps %xmm15,-24(%rax)
.Lxts_dec_body:
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.p2align 4
+.p2align 5
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.p2align 5
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -2181,7 +2559,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
@@ -2205,7 +2583,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -2222,7 +2600,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -2248,7 +2626,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -2258,14 +2636,8 @@ aesni_xts_decrypt:
.p2align 4
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -2276,16 +2648,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -2309,7 +2681,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -2339,22 +2711,45 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
movaps -160(%rbp),%xmm6
+ movaps %xmm0,-160(%rbp)
movaps -144(%rbp),%xmm7
+ movaps %xmm0,-144(%rbp)
movaps -128(%rbp),%xmm8
+ movaps %xmm0,-128(%rbp)
movaps -112(%rbp),%xmm9
+ movaps %xmm0,-112(%rbp)
movaps -96(%rbp),%xmm10
+ movaps %xmm0,-96(%rbp)
movaps -80(%rbp),%xmm11
+ movaps %xmm0,-80(%rbp)
movaps -64(%rbp),%xmm12
+ movaps %xmm0,-64(%rbp)
movaps -48(%rbp),%xmm13
+ movaps %xmm0,-48(%rbp)
movaps -32(%rbp),%xmm14
+ movaps %xmm0,-32(%rbp)
movaps -16(%rbp),%xmm15
+ movaps %xmm0,-16(%rbp)
+ movaps %xmm0,0(%rsp)
+ movaps %xmm0,16(%rsp)
+ movaps %xmm0,32(%rsp)
+ movaps %xmm0,48(%rsp)
+ movaps %xmm0,64(%rsp)
+ movaps %xmm0,80(%rsp)
+ movaps %xmm0,96(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@@ -2406,7 +2801,7 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2416,26 +2811,59 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.p2align 4
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.p2align 4
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $176,%rsp
@@ -2454,7 +2882,7 @@ aesni_cbc_encrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
@@ -2469,10 +2897,15 @@ aesni_cbc_encrypt:
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.p2align 4
@@ -2487,7 +2920,7 @@ aesni_cbc_encrypt:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2501,8 +2934,8 @@ aesni_cbc_encrypt:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
@@ -2516,6 +2949,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2525,6 +2959,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2534,6 +2969,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2543,6 +2979,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2552,6 +2989,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2561,6 +2999,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2570,7 +3009,6 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb .Lcbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
@@ -2581,6 +3019,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2600,6 +3039,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2609,18 +3049,20 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.p2align 4
.Lcbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
@@ -2632,16 +3074,16 @@ aesni_cbc_encrypt:
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
@@ -2663,21 +3105,21 @@ aesni_cbc_encrypt:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
@@ -2688,14 +3130,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.p2align 4
@@ -2710,36 +3157,88 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
+.p2align 4
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
@@ -2753,13 +3252,18 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
@@ -2769,12 +3273,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2782,13 +3286,13 @@ aesni_cbc_encrypt:
.p2align 4
.Lcbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
@@ -2801,7 +3305,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
@@ -2814,39 +3320,61 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.p2align 4
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.p2align 4
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movaps 16(%rsp),%xmm6
+ movaps %xmm0,16(%rsp)
movaps 32(%rsp),%xmm7
+ movaps %xmm0,32(%rsp)
movaps 48(%rsp),%xmm8
+ movaps %xmm0,48(%rsp)
movaps 64(%rsp),%xmm9
+ movaps %xmm0,64(%rsp)
movaps 80(%rsp),%xmm10
+ movaps %xmm0,80(%rsp)
movaps 96(%rsp),%xmm11
+ movaps %xmm0,96(%rsp)
movaps 112(%rsp),%xmm12
+ movaps %xmm0,112(%rsp)
movaps 128(%rsp),%xmm13
+ movaps %xmm0,128(%rsp)
movaps 144(%rsp),%xmm14
+ movaps %xmm0,144(%rsp)
movaps 160(%rsp),%xmm15
+ movaps %xmm0,160(%rsp)
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@@ -2858,7 +3386,7 @@ aesni_cbc_encrypt:
.def aesni_set_decrypt_key; .scl 2; .type 32; .endef
.p2align 4
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%edx
testl %eax,%eax
@@ -2886,7 +3414,9 @@ aesni_set_decrypt_key:
movups (%r8),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rcx)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2897,15 +3427,17 @@ aesni_set_decrypt_key:
.p2align 4
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rcx,%rcx
jz .Lenc_key_ret
testq %r8,%r8
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rcx),%xmm0
xorps %xmm4,%xmm4
+ andl _gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%r8),%rax
cmpl $256,%edx
je .L14rounds
@@ -2916,6 +3448,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%edx
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%r8)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@@ -2943,9 +3478,79 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.p2align 4
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%r8)
+ jmp .Loop_key128
+
+.p2align 4
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %edx,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.p2align 4
.L12rounds:
movq 16(%rcx),%xmm2
movl $11,%edx
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%r8)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@@ -2969,10 +3574,54 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.p2align 4
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%r8)
+ jmp .Loop_key192
+
+.p2align 4
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %edx,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.p2align 4
.L14rounds:
movups 16(%rcx),%xmm2
movl $13,%edx
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%r8)
movups %xmm2,16(%r8)
.byte 102,15,58,223,202,1
@@ -3007,9 +3656,69 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.p2align 4
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%r8)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%r8)
+ jmp .Loop_key256
+
+.p2align 4
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %edx,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.p2align 4
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@@ -3095,32 +3804,21 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
-.def ecb_se_handler; .scl 3; .type 32; .endef
-.p2align 4
-ecb_se_handler:
- pushq %rsi
- pushq %rdi
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushfq
- subq $64,%rsp
-
- movq 152(%r8),%rax
-
- jmp .Lcommon_seh_tail
-
-
-.def ccm64_se_handler; .scl 3; .type 32; .endef
+.def ecb_ccm64_se_handler; .scl 3; .type 32; .endef
.p2align 4
-ccm64_se_handler:
+ecb_ccm64_se_handler:
pushq %rsi
pushq %rdi
pushq %rbx
@@ -3153,7 +3851,7 @@ ccm64_se_handler:
leaq 0(%rax),%rsi
leaq 512(%r8),%rdi
movl $8,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
leaq 88(%rax),%rax
jmp .Lcommon_seh_tail
@@ -3195,7 +3893,7 @@ ctr_xts_se_handler:
leaq -160(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
jmp .Lcommon_rbp_tail
@@ -3216,7 +3914,7 @@ cbc_se_handler:
movq 152(%r8),%rax
movq 248(%r8),%rbx
- leaq .Lcbc_decrypt(%rip),%r10
+ leaq .Lcbc_decrypt_bulk(%rip),%r10
cmpq %r10,%rbx
jb .Lcommon_seh_tail
@@ -3231,7 +3929,7 @@ cbc_se_handler:
leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
.Lcommon_rbp_tail:
movq 160(%r8),%rax
@@ -3253,7 +3951,7 @@ cbc_se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -3323,31 +4021,32 @@ cbc_se_handler:
.p2align 3
.LSEH_info_ecb:
.byte 9,0,0,0
-.rva ecb_se_handler
+.rva ecb_ccm64_se_handler
+.rva .Lecb_enc_body,.Lecb_enc_ret
.LSEH_info_ccm64_enc:
.byte 9,0,0,0
-.rva ccm64_se_handler
-.rva .Lccm64_enc_body,.Lccm64_enc_ret
+.rva ecb_ccm64_se_handler
+.rva .Lccm64_enc_body,.Lccm64_enc_ret
.LSEH_info_ccm64_dec:
.byte 9,0,0,0
-.rva ccm64_se_handler
-.rva .Lccm64_dec_body,.Lccm64_dec_ret
+.rva ecb_ccm64_se_handler
+.rva .Lccm64_dec_body,.Lccm64_dec_ret
.LSEH_info_ctr32:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lctr32_body,.Lctr32_epilogue
+.rva .Lctr32_body,.Lctr32_epilogue
.LSEH_info_xts_enc:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lxts_enc_body,.Lxts_enc_epilogue
+.rva .Lxts_enc_body,.Lxts_enc_epilogue
.LSEH_info_xts_dec:
.byte 9,0,0,0
.rva ctr_xts_se_handler
-.rva .Lxts_dec_body,.Lxts_dec_epilogue
+.rva .Lxts_dec_body,.Lxts_dec_epilogue
.LSEH_info_cbc:
.byte 9,0,0,0
.rva cbc_se_handler
.LSEH_info_key:
.byte 0x01,0x04,0x01,0x00
-.byte 0x04,0x02,0x00,0x00
+.byte 0x04,0x02,0x00,0x00
diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s
index 221a226d6b..f4bcee28f0 100644
--- a/lib/accelerated/x86/coff/ghash-x86_64.s
+++ b/lib/accelerated/x86/coff/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl gcm_gmult_4bit
.def gcm_gmult_4bit; .scl 2; .type 32; .endef
.p2align 4
@@ -65,14 +66,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.p2align 4
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -88,13 +89,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -103,19 +104,19 @@ gcm_gmult_4bit:
.p2align 4
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -720,8 +721,8 @@ gcm_init_clmul:
.L_init_clmul:
.LSEH_begin_gcm_init_clmul:
-.byte 0x48,0x83,0xec,0x18
-.byte 0x0f,0x29,0x34,0x24
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
movdqu (%rdx),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -935,184 +936,188 @@ gcm_ghash_clmul:
leaq -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
-.byte 0x48,0x8d,0x60,0xe0
-.byte 0x0f,0x29,0x70,0xe0
-.byte 0x0f,0x29,0x78,0xf0
-.byte 0x44,0x0f,0x29,0x00
-.byte 0x44,0x0f,0x29,0x48,0x10
-.byte 0x44,0x0f,0x29,0x50,0x20
-.byte 0x44,0x0f,0x29,0x58,0x30
-.byte 0x44,0x0f,0x29,0x60,0x40
-.byte 0x44,0x0f,0x29,0x68,0x50
-.byte 0x44,0x0f,0x29,0x70,0x60
-.byte 0x44,0x0f,0x29,0x78,0x70
- movdqa .Lbswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rcx),%xmm0
movdqu (%rdx),%xmm2
- movdqu 32(%rdx),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rdx),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%r9
+ subq $0x10,%r9
jz .Lodd_tail
- movdqu 16(%rdx),%xmm9
- cmpq $48,%r9
+ movdqu 16(%rdx),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%r9
jb .Lskip4x
- subq $48,%r9
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%r9
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rdx),%xmm14
movdqu 64(%rdx),%xmm15
- movdqu 48(%r8),%xmm6
+ movdqu 48(%r8),%xmm3
movdqu 32(%r8),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%r8),%xmm11
- movdqu 0(%r8),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%r8),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jc .Ltail4x
jmp .Lmod4_loop
.p2align 5
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%r8),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%r8),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%r8),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rdx),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rdx),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa .L7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%r8),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%r8),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%r8),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
@@ -1136,10 +1141,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%r9
+ addq $0x40,%r9
jz .Ldone
- movdqu 32(%rdx),%xmm10
- subq $16,%r9
+ movdqu 32(%rdx),%xmm7
+ subq $0x10,%r9
jz .Lodd_tail
.Lskip4x:
@@ -1147,102 +1152,106 @@ gcm_ghash_clmul:
- movdqu (%r8),%xmm3
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+ movdqu 16(%r8),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%r8),%r8
- subq $32,%r9
+ nop
+ subq $0x20,%r9
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.p2align 5
.Lmod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%r8),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%r8),%xmm6
-
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%r8),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%r8),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%r8),%r8
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%r8),%r8
- subq $32,%r9
+ subq $0x20,%r9
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
@@ -1271,15 +1280,15 @@ gcm_ghash_clmul:
jnz .Ldone
.Lodd_tail:
- movdqu (%r8),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1312,7 +1321,7 @@ gcm_ghash_clmul:
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rcx)
movaps (%rsp),%xmm6
movaps 16(%rsp),%xmm7
@@ -1332,7 +1341,115 @@ gcm_ghash_clmul:
.def gcm_init_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_init_avx:
- jmp .L_init_clmul
+.LSEH_begin_gcm_init_avx:
+
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
+ vzeroupper
+
+ vmovdqu (%rdx),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.p2align 5
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rcx)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rcx)
+ leaq 48(%rcx),%rcx
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+
+ vzeroupper
+ movaps (%rsp),%xmm6
+ leaq 24(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+ .byte 0xf3,0xc3
.globl gcm_gmult_avx
.def gcm_gmult_avx; .scl 2; .type 32; .endef
@@ -1344,7 +1461,403 @@ gcm_gmult_avx:
.def gcm_ghash_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_ghash_avx:
- jmp .L_ghash_clmul
+ leaq -136(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ vzeroupper
+
+ vmovdqu (%rcx),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rdx),%rdx
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%r9
+ jb .Lshort_avx
+ subq $0x80,%r9
+
+ vmovdqu 112(%r8),%xmm14
+ vmovdqu 0-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rdx),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%r8),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rdx),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%r8),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%r8),%r8
+ cmpq $0x80,%r9
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%r9
+ jmp .Loop8x_avx
+
+.p2align 5
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%r8),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%r8),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%r8),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%r8),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%r8),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%r8),%r8
+ subq $0x80,%r9
+ jnc .Loop8x_avx
+
+ addq $0x80,%r9
+ jmp .Ltail_no_xor_avx
+
+.p2align 5
+.Lshort_avx:
+ vmovdqu -16(%r8,%r9,1),%xmm14
+ leaq (%r8,%r9,1),%r8
+ vmovdqu 0-64(%rdx),%xmm6
+ vmovdqu 32-64(%rdx),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jmp .Ltail_avx
+
+.p2align 5
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%r9
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rcx)
+ vzeroupper
+ movaps (%rsp),%xmm6
+ movaps 16(%rsp),%xmm7
+ movaps 32(%rsp),%xmm8
+ movaps 48(%rsp),%xmm9
+ movaps 64(%rsp),%xmm10
+ movaps 80(%rsp),%xmm11
+ movaps 96(%rsp),%xmm12
+ movaps 112(%rsp),%xmm13
+ movaps 128(%rsp),%xmm14
+ movaps 144(%rsp),%xmm15
+ leaq 168(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+ .byte 0xf3,0xc3
.p2align 6
.Lbswap_mask:
@@ -1451,7 +1964,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1498,31 +2011,38 @@ se_handler:
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+.rva .LSEH_begin_gcm_init_avx
+.rva .LSEH_end_gcm_init_avx
+.rva .LSEH_info_gcm_init_clmul
+
+.rva .LSEH_begin_gcm_ghash_avx
+.rva .LSEH_end_gcm_ghash_avx
+.rva .LSEH_info_gcm_ghash_clmul
.section .xdata
.p2align 3
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lgmult_prologue,.Lgmult_epilogue
+.rva .Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lghash_prologue,.Lghash_epilogue
+.rva .Lghash_prologue,.Lghash_epilogue
.LSEH_info_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x22,0x00,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x33,0x16,0x00
-.byte 0x33,0xf8,0x09,0x00
-.byte 0x2e,0xe8,0x08,0x00
-.byte 0x29,0xd8,0x07,0x00
-.byte 0x24,0xc8,0x06,0x00
-.byte 0x1f,0xb8,0x05,0x00
-.byte 0x1a,0xa8,0x04,0x00
-.byte 0x15,0x98,0x03,0x00
-.byte 0x10,0x88,0x02,0x00
-.byte 0x0c,0x78,0x01,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x01,0x15,0x00
+.byte 0x33,0xf8,0x09,0x00
+.byte 0x2e,0xe8,0x08,0x00
+.byte 0x29,0xd8,0x07,0x00
+.byte 0x24,0xc8,0x06,0x00
+.byte 0x1f,0xb8,0x05,0x00
+.byte 0x1a,0xa8,0x04,0x00
+.byte 0x15,0x98,0x03,0x00
+.byte 0x10,0x88,0x02,0x00
+.byte 0x0c,0x78,0x01,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x01,0x15,0x00
diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86.s b/lib/accelerated/x86/elf/aes-ssse3-x86.s
index 1f102940ef..3aa221267a 100644
--- a/lib/accelerated/x86/elf/aes-ssse3-x86.s
+++ b/lib/accelerated/x86/elf/aes-ssse3-x86.s
@@ -672,7 +672,4 @@ vpaes_cbc_encrypt:
ret
.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
-
.section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
index 27a9f692e1..31a6161c58 100644
--- a/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/elf/aes-ssse3-x86_64.s
@@ -72,7 +72,7 @@ _vpaes_encrypt_core:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -132,10 +132,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -231,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -254,7 +254,7 @@ _vpaes_schedule_core:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
@@ -277,7 +277,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp .Loop_schedule_128
@@ -298,7 +298,7 @@ _vpaes_schedule_core:
.align 16
.Lschedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -307,13 +307,13 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
@@ -330,21 +330,21 @@ _vpaes_schedule_core:
.align 16
.Lschedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -379,7 +379,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -411,8 +411,8 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -449,7 +449,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -608,7 +608,7 @@ _vpaes_schedule_mangle:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
@@ -626,7 +626,7 @@ vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
@@ -834,11 +834,8 @@ _vpaes_consts:
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 64
.size _vpaes_consts,.-_vpaes_consts
-
.section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/elf/aesni-gcm-x86_64.s b/lib/accelerated/x86/elf/aesni-gcm-x86_64.s
new file mode 100644
index 0000000000..07f177d8d4
--- /dev/null
+++ b/lib/accelerated/x86/elf/aesni-gcm-x86_64.s
@@ -0,0 +1,794 @@
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+.section .note.GNU-stack,"",%progbits
diff --git a/lib/accelerated/x86/elf/aesni-x86.s b/lib/accelerated/x86/elf/aesni-x86.s
index 5d70f2568f..73d623cbda 100644
--- a/lib/accelerated/x86/elf/aesni-x86.s
+++ b/lib/accelerated/x86/elf/aesni-x86.s
@@ -60,7 +60,10 @@ aesni_encrypt:
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_encrypt,.-.L_aesni_encrypt_begin
.globl aesni_decrypt
@@ -84,32 +87,90 @@ aesni_decrypt:
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz .L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -122,25 +183,26 @@ _aesni_encrypt3:
.align 16
_aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-.L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz .L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -154,27 +216,29 @@ _aesni_decrypt3:
_aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz .L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -190,27 +254,29 @@ _aesni_encrypt4:
_aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-.L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz .L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -225,45 +291,42 @@ _aesni_decrypt4:
.align 16
_aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp .L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L006enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 16
.L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz .L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -282,45 +345,42 @@ _aesni_encrypt6:
.align 16
_aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp .L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L007dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 16
.L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz .L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -350,14 +410,14 @@ aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L008ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L009ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L010ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -366,9 +426,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L011ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L012ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -383,12 +443,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L011ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L012ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -397,18 +457,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L010ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L013ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L014ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L015ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L016ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
@@ -417,50 +477,49 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L013ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L017enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L017enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+.L018ecb_enc_two:
+ call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_three:
+.L019ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_four:
+.L020ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L009ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L018ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -469,9 +528,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L019ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L020ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -486,12 +545,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L019ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L020ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -500,18 +559,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L008ecb_ret
-.L018ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L021ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L022ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L023ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L024ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
@@ -520,44 +579,51 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L021ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L025dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L025dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L026ecb_dec_two:
+ call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_three:
+.L027ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L008ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_four:
+.L028ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L008ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -596,48 +662,56 @@ aesni_ccm64_encrypt_blocks:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-.L026ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-.L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz .L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -685,71 +759,82 @@ aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L028enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L028enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp .L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L033ccm64_dec_outer
.align 16
-.L029ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L030ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-.L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz .L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp .L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp .L033ccm64_dec_outer
.align 16
-.L030ccm64_dec_break:
+.L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L032enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L032enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -775,7 +860,7 @@ aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L033ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -791,63 +876,59 @@ aesni_ctr32_encrypt_blocks:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L034ctr32_tail
+ jb .L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L035ctr32_loop6
-.align 16
-.L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+ jmp .L039ctr32_loop6
+.align 16
+.L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -858,51 +939,51 @@ aesni_ctr32_encrypt_blocks:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L035ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L036ctr32_ret
+ jz .L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-.L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb .L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je .L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb .L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L040ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
@@ -920,39 +1001,39 @@ aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L033ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L037ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L041enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L041enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L038ctr32_two:
- call _aesni_encrypt3
+.L042ctr32_two:
+ call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L039ctr32_three:
+.L043ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -963,9 +1044,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L036ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_four:
+.L044ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -979,7 +1060,18 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L036ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1004,12 +1096,12 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L042enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L042enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1033,12 +1125,14 @@ aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L044xts_enc_loop6
+ jc .L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L048xts_enc_loop6
.align 16
-.L044xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1074,6 +1168,7 @@ aesni_xts_encrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1089,19 +1184,17 @@ aesni_xts_encrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call .L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1126,26 +1219,25 @@ aesni_xts_encrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L043xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L045xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L046xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L047xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1154,7 +1246,7 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L048xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1164,7 +1256,7 @@ aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L049xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1196,9 +1288,9 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L046xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1206,37 +1298,36 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L051enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L051enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1254,9 +1345,9 @@ aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1278,28 +1369,28 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L050xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L045xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L053xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L050xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L052xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L053xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1307,7 +1398,7 @@ aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L053xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1317,16 +1408,30 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L054enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L054enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L052xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1351,12 +1456,12 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L055enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L055enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1385,12 +1490,14 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp .L057xts_dec_loop6
+ jc .L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L061xts_dec_loop6
.align 16
-.L057xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1426,6 +1533,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1441,19 +1549,17 @@ aesni_xts_decrypt:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call .L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1478,26 +1584,25 @@ aesni_xts_decrypt:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc .L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L056xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L058xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L059xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L060xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1506,7 +1611,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L061xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1516,7 +1621,7 @@ aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L062xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1548,9 +1653,9 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L059xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1558,36 +1663,36 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L064dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L064dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1605,9 +1710,9 @@ aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1629,20 +1734,20 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L063xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L058xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L066xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L063xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L065xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1652,7 +1757,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L066xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1666,16 +1771,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L067dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L067dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L068xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1683,7 +1788,7 @@ aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L068xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1693,16 +1798,30 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L065xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1728,7 +1847,7 @@ aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L070cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1736,14 +1855,14 @@ aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L071cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L072cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1751,24 +1870,25 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L074enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L074enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L073cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L072cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L075cbc_ret
-.L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1779,20 +1899,20 @@ aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L073cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L071cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L076cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L077cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L078cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L077cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1822,28 +1942,28 @@ aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L078cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L079cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L076cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L080cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L081cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L082cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L083cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1861,56 +1981,62 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L080cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L084dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L084dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+.L085cbc_dec_two:
+ call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_three:
+.L086cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L079cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_four:
+.L087cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1920,28 +2046,44 @@ aesni_cbc_encrypt:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L079cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L085cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L085cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L075cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L070cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1951,52 +2093,62 @@ aesni_cbc_encrypt:
.type _aesni_set_encrypt_key,@function
.align 16
_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L086bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L086bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal _gnutls_x86_cpuid_s,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08714rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L08812rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L089bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09010rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L091key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L092key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L092key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L092key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L091key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2005,38 +2157,91 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L08812rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L093key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L094key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L095key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L094key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L095key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L093key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L096key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2050,56 +2255,90 @@ _aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L094key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L096key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08714rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L097key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L099key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L098key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L099key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L099key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L097key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2108,7 +2347,7 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L098key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2118,13 +2357,70 @@ _aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L086bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L089bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
.globl aesni_set_encrypt_key
@@ -2150,7 +2446,7 @@ aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L100dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2158,7 +2454,7 @@ aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L101dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2168,20 +2464,26 @@ aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L101dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L100dec_key_ret:
+.L114dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
-
+.comm _gnutls_x86_cpuid_s,16,4
.section .note.GNU-stack,"",%progbits
-
-
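
Note on the two-block helpers: the 32-bit tail paths above now call a dedicated pair routine (_aesni_encrypt2/_aesni_decrypt2) instead of zeroing a dummy block and going through the three-block routine, and the 64-bit file below defines the same helpers with the round keys addressed through a scaled index (the round count shifted left by 4) rather than a decrementing counter. The C intrinsics sketch below is only an illustration of that two-block interleaving; the function name, signature, and key-schedule layout are assumptions made for this note, and what the library actually ships is the generated assembly in this diff, not C.

/*
 * Illustrative sketch (not part of the generated code): two-block
 * interleaved AES-NI encryption, modelled on the new _aesni_encrypt2.
 * Assumes a pre-expanded schedule of (rounds + 1) 128-bit round keys.
 * Build with AES-NI enabled, e.g. gcc -maes.
 */
#include <immintrin.h>

static void aesni_encrypt2_sketch(const __m128i *rk, int rounds,
                                  __m128i *b0, __m128i *b1)
{
    /* Round-0 whitening of both blocks against the first round key. */
    __m128i x0 = _mm_xor_si128(*b0, rk[0]);
    __m128i x1 = _mm_xor_si128(*b1, rk[0]);

    /* Interleave the two blocks through the middle rounds so the two
     * AESENC dependency chains can overlap in the pipeline. */
    for (int i = 1; i < rounds; i++) {
        x0 = _mm_aesenc_si128(x0, rk[i]);
        x1 = _mm_aesenc_si128(x1, rk[i]);
    }

    /* Final round for both blocks. */
    *b0 = _mm_aesenclast_si128(x0, rk[rounds]);
    *b1 = _mm_aesenclast_si128(x1, rk[rounds]);
}
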
diff --git a/lib/accelerated/x86/elf/aesni-x86_64.s b/lib/accelerated/x86/elf/aesni-x86_64.s
index 5d7ea46937..76d44fc2a8 100644
--- a/lib/accelerated/x86/elf/aesni-x86_64.s
+++ b/lib/accelerated/x86/elf/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl aesni_encrypt
.type aesni_encrypt,@function
.align 16
@@ -53,9 +54,12 @@ aesni_encrypt:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_enc1_1
+ jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
@@ -74,34 +78,96 @@ aesni_decrypt:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz .Loop_dec1_2
+ jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+.size _aesni_decrypt2,.-_aesni_decrypt2
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
@@ -116,25 +182,26 @@ _aesni_encrypt3:
.align 16
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
@@ -149,28 +216,30 @@ _aesni_decrypt3:
.align 16
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
@@ -187,28 +256,30 @@ _aesni_encrypt4:
.align 16
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
@@ -225,43 +296,40 @@ _aesni_decrypt4:
.align 16
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
@@ -282,43 +350,40 @@ _aesni_encrypt6:
.align 16
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,46 @@ _aesni_decrypt6:
.align 16
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
@@ -409,52 +468,46 @@ _aesni_encrypt8:
.align 16
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
@@ -489,7 +542,7 @@ aesni_ecb_encrypt:
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
@@ -501,7 +554,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
@@ -529,7 +582,7 @@ aesni_ecb_encrypt:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
@@ -543,26 +596,27 @@ aesni_ecb_encrypt:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -583,14 +637,13 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_3
+ jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
@@ -632,7 +685,7 @@ aesni_ecb_encrypt:
.align 16
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
@@ -644,7 +697,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
@@ -673,49 +726,66 @@ aesni_ecb_encrypt:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
@@ -728,53 +798,76 @@ aesni_ecb_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_4
+ jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
@@ -782,56 +875,62 @@ aesni_ecb_encrypt:
.align 16
aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa .Lincrement64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
@@ -839,15 +938,15 @@ aesni_ccm64_encrypt_blocks:
.align 16
aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lincrement64(%rip),%xmm9
movdqa .Lbswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -857,17 +956,21 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_5
+ jnz .Loop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -876,36 +979,36 @@ aesni_ccm64_decrypt_blocks:
jz .Lccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp .Lccm64_dec2_loop
+.align 16
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -916,23 +1019,58 @@ aesni_ccm64_decrypt_blocks:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz .Loop_enc1_6
+ jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -947,32 +1085,33 @@ aesni_ctr32_encrypt_blocks:
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
@@ -980,7 +1119,9 @@ aesni_ctr32_encrypt_blocks:
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl _gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
@@ -991,10 +1132,104 @@ aesni_ctr32_encrypt_blocks:
cmpq $8,%rdx
jb .Lctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp .Lctr32_loop8
+.align 16
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call .Lenc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
.align 32
.Lctr32_loop8:
addl $8,%r8d
@@ -1007,6 +1242,7 @@ aesni_ctr32_encrypt_blocks:
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
@@ -1015,11 +1251,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
@@ -1028,11 +1265,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
@@ -1041,11 +1279,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
@@ -1054,11 +1293,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
@@ -1067,11 +1307,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
@@ -1080,11 +1321,12 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
@@ -1093,21 +1335,21 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb .Lctr32_enc_done
.byte 102,15,56,220,209
@@ -1150,7 +1392,9 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+.align 16
.Lctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
@@ -1162,8 +1406,8 @@ aesni_ctr32_encrypt_blocks:
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1172,26 +1416,26 @@ aesni_ctr32_encrypt_blocks:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
@@ -1217,29 +1461,32 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call .Lenc_loop8_enter
@@ -1272,19 +1519,19 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
@@ -1301,10 +1548,10 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz .Lctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
@@ -1324,30 +1571,33 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-.align 16
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-
-.align 16
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@@ -1359,245 +1609,285 @@ aesni_ctr32_encrypt_blocks:
aesni_xts_encrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.align 16
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1607,6 +1897,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1638,7 +1929,7 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_9
+ jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1654,7 +1945,7 @@ aesni_xts_encrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1700,15 +1991,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1743,12 +2034,35 @@ aesni_xts_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_10
+ jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@@ -1760,251 +2074,291 @@ aesni_xts_encrypt:
aesni_xts_decrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
.Loop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz .Loop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz .Loop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.align 16
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
.Lxts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -2030,7 +2384,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
@@ -2054,7 +2408,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_12
+ jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -2071,7 +2425,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -2097,7 +2451,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -2107,14 +2461,8 @@ aesni_xts_decrypt:
.align 16
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -2125,16 +2473,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -2158,7 +2506,7 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_13
+ jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -2188,12 +2536,35 @@ aesni_xts_decrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_14
+ jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@@ -2232,7 +2603,7 @@ aesni_cbc_encrypt:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_enc1_15
+ jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2242,26 +2613,59 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp .Lcbc_enc_loop
+ jmp .Lcbc_enc_loop
.align 16
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2269,7 +2673,7 @@ aesni_cbc_encrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
@@ -2284,10 +2688,15 @@ aesni_cbc_encrypt:
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl _gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
@@ -2302,7 +2711,7 @@ aesni_cbc_encrypt:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2316,8 +2725,8 @@ aesni_cbc_encrypt:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
@@ -2331,6 +2740,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2340,6 +2750,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2349,6 +2760,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2358,6 +2770,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2367,6 +2780,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2376,6 +2790,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2385,7 +2800,6 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb .Lcbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
@@ -2396,6 +2810,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2415,6 +2830,7 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2424,18 +2840,20 @@ aesni_cbc_encrypt:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.align 16
.Lcbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
@@ -2447,16 +2865,16 @@ aesni_cbc_encrypt:
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
@@ -2478,21 +2896,21 @@ aesni_cbc_encrypt:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
@@ -2503,14 +2921,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2525,36 +2948,88 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
@@ -2568,13 +3043,18 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2584,12 +3064,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2597,13 +3077,13 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2616,7 +3096,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2629,29 +3111,45 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@@ -2661,7 +3159,7 @@ aesni_cbc_encrypt:
.type aesni_set_decrypt_key,@function
.align 16
aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
@@ -2689,7 +3187,9 @@ aesni_set_decrypt_key:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2700,15 +3200,17 @@ aesni_set_decrypt_key:
.align 16
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz .Lenc_key_ret
testq %rdx,%rdx
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl _gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@@ -2719,6 +3221,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@@ -2746,9 +3251,79 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@@ -2772,10 +3347,54 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -2810,9 +3429,69 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@@ -2898,11 +3577,16 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-
.section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/elf/cpuid-x86.s b/lib/accelerated/x86/elf/cpuid-x86.s
index c7a2b9e166..83a6d23545 100644
--- a/lib/accelerated/x86/elf/cpuid-x86.s
+++ b/lib/accelerated/x86/elf/cpuid-x86.s
@@ -72,7 +72,4 @@ gnutls_have_cpuid:
.size gnutls_have_cpuid,.-.L_gnutls_have_cpuid_begin
.byte 67,80,85,73,68,32,102,111,114,32,120,56,54,0
-
.section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/elf/ghash-x86_64.s b/lib/accelerated/x86/elf/ghash-x86_64.s
index 067ff0a760..e2568a6fd6 100644
--- a/lib/accelerated/x86/elf/ghash-x86_64.s
+++ b/lib/accelerated/x86/elf/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,@function
.align 16
@@ -58,14 +59,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -81,13 +82,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -96,19 +97,19 @@ gcm_gmult_4bit:
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -905,173 +906,177 @@ gcm_gmult_clmul:
.align 32
gcm_ghash_clmul:
.L_ghash_clmul:
- movdqa .Lbswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
- movdqu 16(%rsi),%xmm9
- cmpq $48,%rcx
+ movdqu 16(%rsi),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%rcx
jb .Lskip4x
- subq $48,%rcx
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
- movdqu 48(%rdx),%xmm6
+ movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%rdx),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rsi),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa .L7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%rdx),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
@@ -1095,10 +1100,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
- movdqu 32(%rsi),%xmm10
- subq $16,%rcx
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
@@ -1106,102 +1111,106 @@ gcm_ghash_clmul:
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
- subq $32,%rcx
+ nop
+ subq $0x20,%rcx
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
-
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%rdx),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%rdx),%xmm6
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%rdx),%rdx
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
@@ -1230,15 +1239,15 @@ gcm_ghash_clmul:
jnz .Ldone
.Lodd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1271,7 +1280,7 @@ gcm_ghash_clmul:
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_ghash_clmul,.-gcm_ghash_clmul
@@ -1279,7 +1288,108 @@ gcm_ghash_clmul:
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
- jmp .L_init_clmul
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
.type gcm_gmult_avx,@function
@@ -1291,7 +1401,377 @@ gcm_gmult_avx:
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
- jmp .L_ghash_clmul
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
@@ -1347,7 +1827,4 @@ gcm_ghash_avx:
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-
.section .note.GNU-stack,"",%progbits
-
-
diff --git a/lib/accelerated/x86/files.mk b/lib/accelerated/x86/files.mk
index 596a9578a9..a134213922 100644
--- a/lib/accelerated/x86/files.mk
+++ b/lib/accelerated/x86/files.mk
@@ -1,9 +1,9 @@
X86_FILES_ELF=elf/aesni-x86.s elf/cpuid-x86.s elf/sha1-ssse3-x86.s elf/sha256-ssse3-x86.s elf/sha512-ssse3-x86.s elf/aes-ssse3-x86.s
X86_FILES_COFF=coff/aesni-x86.s coff/cpuid-x86.s coff/sha1-ssse3-x86.s coff/sha256-ssse3-x86.s coff/sha512-ssse3-x86.s coff/aes-ssse3-x86.s
X86_FILES_MACOSX=macosx/aesni-x86.s macosx/cpuid-x86.s macosx/sha1-ssse3-x86.s macosx/sha256-ssse3-x86.s macosx/sha512-ssse3-x86.s macosx/aes-ssse3-x86.s
-X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s
-X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s
-X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s
+X86_64_FILES_ELF=elf/aesni-x86_64.s elf/cpuid-x86_64.s elf/ghash-x86_64.s elf/sha1-ssse3-x86_64.s elf/sha512-ssse3-x86_64.s elf/aes-ssse3-x86_64.s elf/aesni-gcm-x86_64.s
+X86_64_FILES_COFF=coff/aesni-x86_64.s coff/cpuid-x86_64.s coff/ghash-x86_64.s coff/sha1-ssse3-x86_64.s coff/sha512-ssse3-x86_64.s coff/aes-ssse3-x86_64.s coff/aesni-gcm-x86_64.s
+X86_64_FILES_MACOSX=macosx/aesni-x86_64.s macosx/cpuid-x86_64.s macosx/ghash-x86_64.s macosx/sha1-ssse3-x86_64.s macosx/sha512-ssse3-x86_64.s macosx/aes-ssse3-x86_64.s macosx/aesni-gcm-x86_64.s
X86_PADLOCK_FILES_ELF=elf/e_padlock-x86.s
X86_PADLOCK_FILES_COFF=coff/e_padlock-x86.s
X86_PADLOCK_FILES_MACOSX=macosx/e_padlock-x86.s
diff --git a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
index 27f25ed866..6014d41ba0 100644
--- a/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
+++ b/lib/accelerated/x86/macosx/aes-ssse3-x86_64.s
@@ -72,7 +72,7 @@ L$enc_loop:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -132,10 +132,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa L$k_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq L$k_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa L$k_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -231,7 +231,7 @@ _vpaes_schedule_core:
- call _vpaes_preheat
+ call _vpaes_preheat
movdqa L$k_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
@@ -254,7 +254,7 @@ L$schedule_am_decrypting:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
L$schedule_go:
cmpl $192,%esi
@@ -277,7 +277,7 @@ L$oop_schedule_128:
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
jmp L$oop_schedule_128
@@ -298,7 +298,7 @@ L$oop_schedule_128:
.p2align 4
L$schedule_192:
movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
@@ -307,13 +307,13 @@ L$schedule_192:
L$oop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp L$oop_schedule_192
@@ -330,21 +330,21 @@ L$oop_schedule_192:
.p2align 4
L$schedule_256:
movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movl $7,%esi
L$oop_schedule_256:
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -379,7 +379,7 @@ L$schedule_mangle_last:
L$schedule_mangle_last_dec:
addq $-16,%rdx
pxor L$k_s63(%rip),%xmm0
- call _vpaes_schedule_transform
+ call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
@@ -411,8 +411,8 @@ L$schedule_mangle_last_dec:
.p2align 4
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -449,7 +449,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -608,7 +608,7 @@ L$schedule_mangle_both:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
@@ -626,7 +626,7 @@ _vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
@@ -834,7 +834,7 @@ L$k_dsbe:
L$k_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.p2align 6
diff --git a/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s b/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s
new file mode 100644
index 0000000000..002041cee2
--- /dev/null
+++ b/lib/accelerated/x86/macosx/aesni-gcm-x86_64.s
@@ -0,0 +1,793 @@
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the Andy Polyakov nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+
+.p2align 5
+_aesni_ctr32_ghash_6x:
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp L$oop6x
+
+.p2align 5
+L$oop6x:
+ addl $100663296,%ebx
+ jc L$handle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+L$resume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp L$enc_tail
+
+.p2align 5
+L$handle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp L$resume_ctr32
+
+.p2align 5
+L$enc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc L$6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp L$oop6x
+
+L$6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+
+.globl _aesni_gcm_decrypt
+
+.p2align 5
+_aesni_gcm_decrypt:
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb L$gcm_dec_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$dec_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$dec_no_key_aliasing
+ subq %r15,%rsp
+L$dec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$gcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+_aesni_ctr32_6x:
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc L$handle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+.p2align 4
+L$oop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz L$oop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.p2align 5
+L$handle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+
+.globl _aesni_gcm_encrypt
+
+.p2align 5
+_aesni_gcm_encrypt:
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb L$gcm_enc_abort
+
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$enc_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$enc_no_key_aliasing
+ subq %r15,%rsp
+L$enc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$gcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$poly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$one_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$two_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+L$one_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+
diff --git a/lib/accelerated/x86/macosx/aesni-x86.s b/lib/accelerated/x86/macosx/aesni-x86.s
index 09ca1cbc5c..275ab58ec5 100644
--- a/lib/accelerated/x86/macosx/aesni-x86.s
+++ b/lib/accelerated/x86/macosx/aesni-x86.s
@@ -59,7 +59,10 @@ L000enc1_loop_1:
leal 16(%edx),%edx
jnz L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.align 4
@@ -81,30 +84,84 @@ L001dec1_loop_2:
leal 16(%edx),%edx
jnz L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.align 4
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L002enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L002enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.align 4
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L003dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L003dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
ret
.align 4
__aesni_encrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L002enc3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L004enc3_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
- movups (%edx),%xmm0
- jnz L002enc3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L004enc3_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -115,25 +172,26 @@ L002enc3_loop:
.align 4
__aesni_decrypt3:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
- movups (%edx),%xmm0
-L003dec3_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L005dec3_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
- movups (%edx),%xmm0
- jnz L003dec3_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L005dec3_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -145,27 +203,29 @@ L003dec3_loop:
__aesni_encrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L004enc4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L006enc4_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%edx),%xmm0
- jnz L004enc4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L006enc4_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -179,27 +239,29 @@ L004enc4_loop:
__aesni_decrypt4:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
- shrl $1,%ecx
- leal 32(%edx),%edx
+ shll $4,%ecx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
pxor %xmm0,%xmm4
pxor %xmm0,%xmm5
- movups (%edx),%xmm0
-L005dec4_loop:
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L007dec4_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%edx),%xmm0
- jnz L005dec4_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L007dec4_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -212,45 +274,42 @@ L005dec4_loop:
.align 4
__aesni_encrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
-.byte 102,15,56,220,241
- movups (%edx),%xmm0
-.byte 102,15,56,220,249
- jmp L_aesni_encrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L008_aesni_encrypt6_inner
.align 4,0x90
-L006enc6_loop:
+L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %ecx
.byte 102,15,56,220,225
+L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.align 4,0x90
L_aesni_encrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leal 32(%edx),%edx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%edx),%xmm0
- jnz L006enc6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -267,45 +326,42 @@ L_aesni_encrypt6_enter:
.align 4
__aesni_decrypt6:
movups (%edx),%xmm0
- shrl $1,%ecx
+ shll $4,%ecx
movups 16(%edx),%xmm1
- leal 32(%edx),%edx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor %xmm0,%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
-.byte 102,15,56,222,241
- movups (%edx),%xmm0
-.byte 102,15,56,222,249
- jmp L_aesni_decrypt6_enter
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L010_aesni_decrypt6_inner
.align 4,0x90
-L007dec6_loop:
+L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %ecx
.byte 102,15,56,222,225
+L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.align 4,0x90
L_aesni_decrypt6_enter:
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leal 32(%edx),%edx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%edx),%xmm0
- jnz L007dec6_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -333,14 +389,14 @@ L_aesni_ecb_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L008ecb_ret
+ jz L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L009ecb_decrypt
+ jz L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L010ecb_enc_tail
+ jb L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -349,9 +405,9 @@ L_aesni_ecb_encrypt_begin:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L011ecb_enc_loop6_enter
+ jmp L015ecb_enc_loop6_enter
.align 4,0x90
-L012ecb_enc_loop6:
+L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -366,12 +422,12 @@ L012ecb_enc_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L011ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L012ecb_enc_loop6
+ jnc L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -380,18 +436,18 @@ L011ecb_enc_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L010ecb_enc_tail:
+ jz L012ecb_ret
+L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L013ecb_enc_one
+ jb L017ecb_enc_one
movups 16(%esi),%xmm3
- je L014ecb_enc_two
+ je L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L015ecb_enc_three
+ jb L019ecb_enc_three
movups 48(%esi),%xmm5
- je L016ecb_enc_four
+ je L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -400,50 +456,49 @@ L010ecb_enc_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L013ecb_enc_one:
+L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L017enc1_loop_3:
+L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L017enc1_loop_3
+ jnz L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L014ecb_enc_two:
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+L018ecb_enc_two:
+ call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L015ecb_enc_three:
+L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L016ecb_enc_four:
+L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L009ecb_decrypt:
+L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L018ecb_dec_tail
+ jb L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -452,9 +507,9 @@ L009ecb_decrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L019ecb_dec_loop6_enter
+ jmp L023ecb_dec_loop6_enter
.align 4,0x90
-L020ecb_dec_loop6:
+L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -469,12 +524,12 @@ L020ecb_dec_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L019ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L020ecb_dec_loop6
+ jnc L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -483,18 +538,18 @@ L019ecb_dec_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L008ecb_ret
-L018ecb_dec_tail:
+ jz L012ecb_ret
+L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L021ecb_dec_one
+ jb L025ecb_dec_one
movups 16(%esi),%xmm3
- je L022ecb_dec_two
+ je L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L023ecb_dec_three
+ jb L027ecb_dec_three
movups 48(%esi),%xmm5
- je L024ecb_dec_four
+ je L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -503,44 +558,51 @@ L018ecb_dec_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L021ecb_dec_one:
+L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L025dec1_loop_4:
+L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L025dec1_loop_4
+ jnz L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L022ecb_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L026ecb_dec_two:
+ call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L023ecb_dec_three:
+L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L008ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L024ecb_dec_four:
+L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L008ecb_ret:
+L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -577,48 +639,56 @@ L_aesni_ccm64_encrypt_blocks_begin:
movl %ebp,20(%esp)
movl %ebp,24(%esp)
movl %ebp,28(%esp)
- shrl $1,%ecx
+ shll $4,%ecx
+ movl $16,%ebx
leal (%edx),%ebp
movdqa (%esp),%xmm5
movdqa %xmm7,%xmm2
- movl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
.byte 102,15,56,0,253
-L026ccm64_enc_outer:
+L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
xorps %xmm0,%xmm2
movups 16(%ebp),%xmm1
xorps %xmm6,%xmm0
- leal 32(%ebp),%edx
xorps %xmm0,%xmm3
- movups (%edx),%xmm0
-L027ccm64_enc2_loop:
+ movups 32(%ebp),%xmm0
+L031ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L027ccm64_enc2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
+ decl %eax
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decl %eax
leal 16(%esi),%esi
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
- leal 16(%edi),%edi
.byte 102,15,56,0,213
- jnz L026ccm64_enc_outer
+ leal 16(%edi),%edi
+ jnz L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -664,71 +734,82 @@ L_aesni_ccm64_decrypt_blocks_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L028enc1_loop_5:
+L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L028enc1_loop_5
+ jnz L032enc1_loop_5
.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
leal 16(%esi),%esi
- jmp L029ccm64_dec_outer
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L029ccm64_dec_outer:
+L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
- movl %ebx,%ecx
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L030ccm64_dec_break
+ jz L034ccm64_dec_break
movups (%ebp),%xmm0
- shrl $1,%ecx
+ movl %ebx,%ecx
movups 16(%ebp),%xmm1
xorps %xmm0,%xmm6
- leal 32(%ebp),%edx
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
- movups (%edx),%xmm0
-L031ccm64_dec2_loop:
+ movups 32(%ebp),%xmm0
+L035ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %ecx
.byte 102,15,56,220,217
- movups 16(%edx),%xmm1
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
.byte 102,15,56,220,208
- leal 32(%edx),%edx
.byte 102,15,56,220,216
- movups (%edx),%xmm0
- jnz L031ccm64_dec2_loop
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leal 16(%esi),%esi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- jmp L029ccm64_dec_outer
+ leal 16(%esi),%esi
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L030ccm64_dec_break:
+L034ccm64_dec_break:
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
movups 16(%edx),%xmm1
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L032enc1_loop_6:
+L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L032enc1_loop_6
+ jnz L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -752,7 +833,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L033ctr32_one_shortcut
+ je L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -768,63 +849,59 @@ L_aesni_ctr32_encrypt_blocks_begin:
.byte 102,15,58,34,253,3
movl 240(%edx),%ecx
bswap %ebx
- pxor %xmm1,%xmm1
pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movdqa (%esp),%xmm2
-.byte 102,15,58,34,203,0
+.byte 102,15,58,34,195,0
leal 3(%ebx),%ebp
-.byte 102,15,58,34,197,0
+.byte 102,15,58,34,205,0
incl %ebx
-.byte 102,15,58,34,203,1
+.byte 102,15,58,34,195,1
incl %ebp
-.byte 102,15,58,34,197,1
+.byte 102,15,58,34,205,1
incl %ebx
-.byte 102,15,58,34,203,2
+.byte 102,15,58,34,195,2
incl %ebp
-.byte 102,15,58,34,197,2
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
- movdqa %xmm0,64(%esp)
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
.byte 102,15,56,0,194
- pshufd $192,%xmm1,%xmm2
- pshufd $128,%xmm1,%xmm3
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L034ctr32_tail
+ jb L038ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
movdqa %xmm7,32(%esp)
- shrl $1,%ecx
movl %edx,%ebp
- movl %ecx,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L035ctr32_loop6
+ jmp L039ctr32_loop6
.align 4,0x90
-L035ctr32_loop6:
- pshufd $64,%xmm1,%xmm4
- movdqa 32(%esp),%xmm1
- pshufd $192,%xmm0,%xmm5
- por %xmm1,%xmm2
- pshufd $128,%xmm0,%xmm6
- por %xmm1,%xmm3
- pshufd $64,%xmm0,%xmm7
- por %xmm1,%xmm4
- por %xmm1,%xmm5
- por %xmm1,%xmm6
- por %xmm1,%xmm7
- movups (%ebp),%xmm0
- movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
- decl %ecx
+L039ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
+.byte 102,15,56,220,209
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movups (%esi),%xmm1
@@ -835,51 +912,51 @@ L035ctr32_loop6:
movups %xmm2,(%edi)
movdqa 16(%esp),%xmm0
xorps %xmm1,%xmm4
- movdqa 48(%esp),%xmm1
+ movdqa 64(%esp),%xmm1
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
paddd %xmm0,%xmm1
- paddd 64(%esp),%xmm0
+ paddd 48(%esp),%xmm0
movdqa (%esp),%xmm2
movups 48(%esi),%xmm3
movups 64(%esi),%xmm4
xorps %xmm3,%xmm5
movups 80(%esi),%xmm3
leal 96(%esi),%esi
- movdqa %xmm1,48(%esp)
-.byte 102,15,56,0,202
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
xorps %xmm4,%xmm6
movups %xmm5,48(%edi)
xorps %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
-.byte 102,15,56,0,194
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
movups %xmm6,64(%edi)
- pshufd $192,%xmm1,%xmm2
+ pshufd $192,%xmm0,%xmm2
movups %xmm7,80(%edi)
leal 96(%edi),%edi
- movl %ebx,%ecx
- pshufd $128,%xmm1,%xmm3
+ pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L035ctr32_loop6
+ jnc L039ctr32_loop6
addl $6,%eax
- jz L036ctr32_ret
+ jz L040ctr32_ret
+ movdqu (%ebp),%xmm7
movl %ebp,%edx
- leal 1(,%ecx,2),%ecx
- movdqa 32(%esp),%xmm7
-L034ctr32_tail:
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L037ctr32_one
- pshufd $64,%xmm1,%xmm4
+ jb L041ctr32_one
+ pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L038ctr32_two
- pshufd $192,%xmm0,%xmm5
+ je L042ctr32_two
+ pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L039ctr32_three
- pshufd $128,%xmm0,%xmm6
+ jb L043ctr32_three
+ pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L040ctr32_four
+ je L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -897,39 +974,39 @@ L034ctr32_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L033ctr32_one_shortcut:
+L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L037ctr32_one:
+L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L041enc1_loop_7:
+L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L041enc1_loop_7
+ jnz L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L038ctr32_two:
- call __aesni_encrypt3
+L042ctr32_two:
+ call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L039ctr32_three:
+L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -940,9 +1017,9 @@ L039ctr32_three:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L036ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L040ctr32_four:
+L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -956,7 +1033,18 @@ L040ctr32_four:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L036ctr32_ret:
+L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -979,12 +1067,12 @@ L_aesni_xts_encrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L042enc1_loop_8:
+L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L042enc1_loop_8
+ jnz L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1008,12 +1096,14 @@ L042enc1_loop_8:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L043xts_enc_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L044xts_enc_loop6
+ jc L047xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L048xts_enc_loop6
.align 4,0x90
-L044xts_enc_loop6:
+L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1049,6 +1139,7 @@ L044xts_enc_loop6:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1064,19 +1155,17 @@ L044xts_enc_loop6:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,220,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,220,217
+.byte 102,15,56,220,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,220,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
- movups (%edx),%xmm0
.byte 102,15,56,220,249
call L_aesni_encrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1101,26 +1190,25 @@ L044xts_enc_loop6:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L044xts_enc_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L048xts_enc_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L043xts_enc_short:
+L047xts_enc_short:
addl $96,%eax
- jz L045xts_enc_done6x
+ jz L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L046xts_enc_one
+ jb L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L047xts_enc_two
+ je L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1129,7 +1217,7 @@ L043xts_enc_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L048xts_enc_three
+ jb L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1139,7 +1227,7 @@ L043xts_enc_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L049xts_enc_four
+ je L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1171,9 +1259,9 @@ L043xts_enc_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L046xts_enc_one:
+L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1181,37 +1269,36 @@ L046xts_enc_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L051enc1_loop_9:
+L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L051enc1_loop_9
+ jnz L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L047xts_enc_two:
+L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- xorps %xmm4,%xmm4
- call __aesni_encrypt3
+ call __aesni_encrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L048xts_enc_three:
+L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1229,9 +1316,9 @@ L048xts_enc_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L049xts_enc_four:
+L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1253,28 +1340,28 @@ L049xts_enc_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L050xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L045xts_enc_done6x:
+L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L052xts_enc_ret
+ jz L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L053xts_enc_steal
+ jmp L057xts_enc_steal
.align 4,0x90
-L050xts_enc_done:
+L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L052xts_enc_ret
+ jz L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L053xts_enc_steal:
+L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1282,7 +1369,7 @@ L053xts_enc_steal:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L053xts_enc_steal
+ jnz L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1292,16 +1379,30 @@ L053xts_enc_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L054enc1_loop_10:
+L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L054enc1_loop_10
+ jnz L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L052xts_enc_ret:
+L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1324,12 +1425,12 @@ L_aesni_xts_decrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L055enc1_loop_11:
+L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L055enc1_loop_11
+ jnz L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1358,12 +1459,14 @@ L055enc1_loop_11:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L056xts_dec_short
- shrl $1,%ecx
- movl %ecx,%ebx
- jmp L057xts_dec_loop6
+ jc L060xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L061xts_dec_loop6
.align 4,0x90
-L057xts_dec_loop6:
+L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1399,6 +1502,7 @@ L057xts_dec_loop6:
pand %xmm3,%xmm7
movups (%esi),%xmm2
pxor %xmm1,%xmm7
+ movl %ebx,%ecx
movdqu 16(%esi),%xmm3
xorps %xmm0,%xmm2
movdqu 32(%esi),%xmm4
@@ -1414,19 +1518,17 @@ L057xts_dec_loop6:
movdqa %xmm7,80(%esp)
pxor %xmm1,%xmm7
movups 16(%ebp),%xmm1
- leal 32(%ebp),%edx
pxor 16(%esp),%xmm3
-.byte 102,15,56,222,209
pxor 32(%esp),%xmm4
-.byte 102,15,56,222,217
+.byte 102,15,56,222,209
pxor 48(%esp),%xmm5
- decl %ecx
-.byte 102,15,56,222,225
pxor 64(%esp),%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,217
pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
.byte 102,15,56,222,241
- movups (%edx),%xmm0
.byte 102,15,56,222,249
call L_aesni_decrypt6_enter
movdqa 80(%esp),%xmm1
@@ -1451,26 +1553,25 @@ L057xts_dec_loop6:
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
- movl %ebx,%ecx
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L057xts_dec_loop6
- leal 1(,%ecx,2),%ecx
+ jnc L061xts_dec_loop6
+ movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L056xts_dec_short:
+L060xts_dec_short:
addl $96,%eax
- jz L058xts_dec_done6x
+ jz L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L059xts_dec_one
+ jb L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L060xts_dec_two
+ je L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1479,7 +1580,7 @@ L056xts_dec_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L061xts_dec_three
+ jb L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1489,7 +1590,7 @@ L056xts_dec_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L062xts_dec_four
+ je L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1521,9 +1622,9 @@ L056xts_dec_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L059xts_dec_one:
+L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1531,36 +1632,36 @@ L059xts_dec_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L064dec1_loop_12:
+L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L064dec1_loop_12
+ jnz L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L060xts_dec_two:
+L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
leal 32(%esi),%esi
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
- call __aesni_decrypt3
+ call __aesni_decrypt2
xorps %xmm5,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L061xts_dec_three:
+L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1578,9 +1679,9 @@ L061xts_dec_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L062xts_dec_four:
+L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1602,20 +1703,20 @@ L062xts_dec_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L063xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L058xts_dec_done6x:
+L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L065xts_dec_ret
+ jz L069xts_dec_ret
movl %eax,112(%esp)
- jmp L066xts_dec_only_one_more
+ jmp L070xts_dec_only_one_more
.align 4,0x90
-L063xts_dec_done:
+L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L065xts_dec_ret
+ jz L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1625,7 +1726,7 @@ L063xts_dec_done:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L066xts_dec_only_one_more:
+L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1639,16 +1740,16 @@ L066xts_dec_only_one_more:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L067dec1_loop_13:
+L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L067dec1_loop_13
+ jnz L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L068xts_dec_steal:
+L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1656,7 +1757,7 @@ L068xts_dec_steal:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L068xts_dec_steal
+ jnz L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1666,16 +1767,30 @@ L068xts_dec_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_14:
+L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_14
+ jnz L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L065xts_dec_ret:
+L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1699,7 +1814,7 @@ L_aesni_cbc_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L070cbc_abort
+ jz L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1707,14 +1822,14 @@ L_aesni_cbc_encrypt_begin:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L071cbc_decrypt
+ je L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L072cbc_enc_tail
+ jb L076cbc_enc_tail
subl $16,%eax
- jmp L073cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L073cbc_enc_loop:
+L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1722,24 +1837,25 @@ L073cbc_enc_loop:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L074enc1_loop_15:
+L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L074enc1_loop_15
+ jnz L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L073cbc_enc_loop
+ jnc L077cbc_enc_loop
addl $16,%eax
- jnz L072cbc_enc_tail
+ jnz L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L075cbc_ret
-L072cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp L079cbc_ret
+L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1750,20 +1866,20 @@ L072cbc_enc_tail:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L073cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L071cbc_decrypt:
+L075cbc_decrypt:
cmpl $80,%eax
- jbe L076cbc_dec_tail
+ jbe L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L077cbc_dec_loop6_enter
+ jmp L081cbc_dec_loop6_enter
.align 4,0x90
-L078cbc_dec_loop6:
+L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L077cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1793,28 +1909,28 @@ L077cbc_dec_loop6_enter:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L078cbc_dec_loop6
+ ja L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L079cbc_dec_tail_collected
+ jle L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L076cbc_dec_tail:
+L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L080cbc_dec_one
+ jbe L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L081cbc_dec_two
+ jbe L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L082cbc_dec_three
+ jbe L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L083cbc_dec_four
+ jbe L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1832,56 +1948,62 @@ L076cbc_dec_tail:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L080cbc_dec_one:
+L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L084dec1_loop_16:
+L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L084dec1_loop_16
+ jnz L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L081cbc_dec_two:
- xorps %xmm4,%xmm4
- call __aesni_decrypt3
+L085cbc_dec_two:
+ call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_three:
+L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L079cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_four:
+L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1891,28 +2013,44 @@ L083cbc_dec_four:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-L079cbc_dec_tail_collected:
+ jmp L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L088cbc_dec_tail_collected:
andl $15,%eax
- jnz L085cbc_dec_tail_partial
+ jnz L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L075cbc_ret
+ pxor %xmm0,%xmm0
+ jmp L079cbc_ret
.align 4,0x90
-L085cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L075cbc_ret:
+ movdqa %xmm2,(%esp)
+L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-L070cbc_abort:
+ pxor %xmm7,%xmm7
+L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1920,52 +2058,62 @@ L070cbc_abort:
ret
.align 4
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz L086bad_pointer
+ jz L091bad_pointer
testl %edx,%edx
- jz L086bad_pointer
+ jz L091bad_pointer
+ call L092pic
+L092pic:
+ popl %ebx
+ leal Lkey_const-L092pic(%ebx),%ebx
+ movl L__gnutls_x86_cpuid_s$non_lazy_ptr-Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je L08714rounds
+ je L09314rounds
cmpl $192,%ecx
- je L08812rounds
+ je L09412rounds
cmpl $128,%ecx
- jne L089bad_keybits
+ jne L095bad_keybits
.align 4,0x90
-L09010rounds:
+L09610rounds:
+ cmpl $268435456,%ebp
+ je L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L091key_128_cold
+ call L098key_128_cold
.byte 102,15,58,223,200,2
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,4
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,8
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,16
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,32
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,64
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,128
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,27
- call L092key_128
+ call L099key_128
.byte 102,15,58,223,200,54
- call L092key_128
+ call L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L092key_128:
+L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L091key_128_cold:
+L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1974,38 +2122,91 @@ L091key_128_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L08812rounds:
+L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L093key_192a_cold
+ call L103key_192a_cold
.byte 102,15,58,223,202,2
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,4
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,8
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,16
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,32
- call L094key_192b
+ call L104key_192b
.byte 102,15,58,223,202,64
- call L095key_192a
+ call L105key_192a
.byte 102,15,58,223,202,128
- call L094key_192b
+ call L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L095key_192a:
+L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L093key_192a_cold:
+L103key_192a_cold:
movaps %xmm2,%xmm5
-L096key_192b_warm:
+L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2019,56 +2220,90 @@ L096key_192b_warm:
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L094key_192b:
+L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L096key_192b_warm
+ jmp L106key_192b_warm
.align 4,0x90
-L08714rounds:
+L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L097key_256a_cold
+ call L109key_256a_cold
.byte 102,15,58,223,200,1
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,2
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,2
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,4
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,4
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,8
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,8
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,16
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,16
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,32
- call L099key_256a
+ call L111key_256a
.byte 102,15,58,223,200,32
- call L098key_256b
+ call L110key_256b
.byte 102,15,58,223,202,64
- call L099key_256a
+ call L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L099key_256a:
+L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L097key_256a_cold:
+L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2077,7 +2312,7 @@ L097key_256a_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L098key_256b:
+L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2087,13 +2322,70 @@ L098key_256b:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 4,0x90
+L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L112loop_key256
+L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 2,0x90
-L086bad_pointer:
+L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 2,0x90
-L089bad_keybits:
+L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.align 4
@@ -2115,7 +2407,7 @@ L_aesni_set_decrypt_key_begin:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L100dec_key_ret
+ jnz L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2123,7 +2415,7 @@ L_aesni_set_decrypt_key_begin:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L101dec_key_inverse:
+L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2133,15 +2425,28 @@ L101dec_key_inverse:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L101dec_key_inverse
+ ja L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-L100dec_key_ret:
+L114dec_key_ret:
ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L__gnutls_x86_cpuid_s$non_lazy_ptr:
+.indirect_symbol __gnutls_x86_cpuid_s
+.long 0
+.comm __gnutls_x86_cpuid_s,16,2
diff --git a/lib/accelerated/x86/macosx/aesni-x86_64.s b/lib/accelerated/x86/macosx/aesni-x86_64.s
index 420fd5a7b6..f0a5606348 100644
--- a/lib/accelerated/x86/macosx/aesni-x86_64.s
+++ b/lib/accelerated/x86/macosx/aesni-x86_64.s
@@ -38,6 +38,7 @@
# *** This file is auto-generated ***
#
.text
+
.globl _aesni_encrypt
.p2align 4
@@ -53,9 +54,12 @@ L$oop_enc1_1:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_enc1_1
+ jnz L$oop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -74,34 +78,96 @@ L$oop_dec1_2:
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
- jnz L$oop_dec1_2
+ jnz L$oop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
.byte 0xf3,0xc3
.p2align 4
_aesni_encrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$enc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop3
.byte 102,15,56,220,209
@@ -116,25 +182,26 @@ L$enc_loop3:
.p2align 4
_aesni_decrypt3:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
L$dec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop3
.byte 102,15,56,222,209
@@ -149,28 +216,30 @@ L$dec_loop3:
.p2align 4
_aesni_encrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$enc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop4
.byte 102,15,56,220,209
@@ -187,28 +256,30 @@ L$enc_loop4:
.p2align 4
_aesni_decrypt4:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
- movups (%rcx),%xmm0
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
L$dec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop4
.byte 102,15,56,222,209
@@ -225,43 +296,40 @@ L$dec_loop4:
.p2align 4
_aesni_encrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+.byte 102,15,56,220,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
- movups (%rcx),%xmm0
-.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
+L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop6
.byte 102,15,56,220,209
@@ -282,43 +350,40 @@ L$enc_loop6_enter:
.p2align 4
_aesni_decrypt6:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+.byte 102,15,56,222,225
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
- movups (%rcx),%xmm0
-.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
+L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop6
.byte 102,15,56,222,209
@@ -339,52 +404,46 @@ L$dec_loop6_enter:
.p2align 4
_aesni_encrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,220,209
pxor %xmm0,%xmm4
-.byte 102,15,56,220,217
pxor %xmm0,%xmm5
-.byte 102,15,56,220,225
pxor %xmm0,%xmm6
-.byte 102,15,56,220,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,220,241
pxor %xmm0,%xmm8
-.byte 102,15,56,220,249
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
- jmp L$enc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
- movups 16(%rcx),%xmm1
L$enc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$enc_loop8
.byte 102,15,56,220,209
@@ -409,52 +468,46 @@ L$enc_loop8_enter:
.p2align 4
_aesni_decrypt8:
movups (%rcx),%xmm0
- shrl $1,%eax
+ shll $4,%eax
movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
-.byte 102,15,56,222,209
pxor %xmm0,%xmm4
-.byte 102,15,56,222,217
pxor %xmm0,%xmm5
-.byte 102,15,56,222,225
pxor %xmm0,%xmm6
-.byte 102,15,56,222,233
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
pxor %xmm0,%xmm7
- decl %eax
-.byte 102,15,56,222,241
pxor %xmm0,%xmm8
-.byte 102,15,56,222,249
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
- jmp L$dec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
+L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
L$dec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
- movups (%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$dec_loop8
.byte 102,15,56,222,209
@@ -489,7 +542,7 @@ _aesni_ecb_encrypt:
testl %r8d,%r8d
jz L$ecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_enc_tail
movdqu (%rdi),%xmm2
@@ -501,7 +554,7 @@ _aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_enc_loop8_enter
.p2align 4
L$ecb_enc_loop8:
@@ -529,7 +582,7 @@ L$ecb_enc_loop8_enter:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_enc_loop8
movups %xmm2,(%rsi)
@@ -543,26 +596,27 @@ L$ecb_enc_loop8_enter:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_enc_one
movups 16(%rdi),%xmm3
je L$ecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_enc_three
movups 48(%rdi),%xmm5
je L$ecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_enc_five
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -583,14 +637,13 @@ L$oop_enc1_3:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_3
+ jnz L$oop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp L$ecb_ret
.p2align 4
L$ecb_enc_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp L$ecb_ret
@@ -632,7 +685,7 @@ L$ecb_enc_six:
.p2align 4
L$ecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb L$ecb_dec_tail
movdqu (%rdi),%xmm2
@@ -644,7 +697,7 @@ L$ecb_decrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp L$ecb_dec_loop8_enter
.p2align 4
L$ecb_dec_loop8:
@@ -673,49 +726,66 @@ L$ecb_dec_loop8_enter:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz L$ecb_ret
L$ecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb L$ecb_dec_one
movups 16(%rdi),%xmm3
je L$ecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb L$ecb_dec_three
movups 48(%rdi),%xmm5
je L$ecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb L$ecb_dec_five
movups 80(%rdi),%xmm7
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
@@ -728,53 +798,76 @@ L$oop_dec1_4:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_4
+ jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
L$ecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
@@ -782,56 +875,62 @@ L$ecb_ret:
.p2align 4
_aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
- movdqu (%r8),%xmm9
- movdqa L$increment64(%rip),%xmm6
+ movdqu (%r8),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- shrl $1,%eax
+ shll $4,%eax
+ movl $16,%r10d
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
- movdqa %xmm9,%xmm2
- movl %eax,%r10d
-.byte 102,68,15,56,0,207
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
jmp L$ccm64_enc_outer
.p2align 4
L$ccm64_enc_outer:
movups (%r11),%xmm0
- movl %r10d,%eax
+ movq %r10,%rax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
- movups (%rcx),%xmm0
+ movups 32(%r11),%xmm0
L$ccm64_enc2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
+ decq %rdx
.byte 102,15,56,221,208
.byte 102,15,56,221,216
- decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
- leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
@@ -839,15 +938,15 @@ L$ccm64_enc2_loop:
.p2align 4
_aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
- movups (%r8),%xmm9
+ movups (%r8),%xmm6
movdqu (%r9),%xmm3
- movdqa L$increment64(%rip),%xmm6
+ movdqa L$increment64(%rip),%xmm9
movdqa L$bswap_mask(%rip),%xmm7
- movaps %xmm9,%xmm2
+ movaps %xmm6,%xmm2
movl %eax,%r10d
movq %rcx,%r11
-.byte 102,68,15,56,0,207
+.byte 102,15,56,0,247
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -857,17 +956,21 @@ L$oop_enc1_5:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_5
+ jnz L$oop_enc1_5
.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_outer:
xorps %xmm2,%xmm8
- movdqa %xmm9,%xmm2
- movl %r10d,%eax
+ movdqa %xmm6,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
@@ -876,36 +979,36 @@ L$ccm64_dec_outer:
jz L$ccm64_dec_break
movups (%r11),%xmm0
- shrl $1,%eax
+ movq %r10,%rax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
- leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
- movups (%rcx),%xmm0
-
+ movups 32(%r11),%xmm0
+ jmp L$ccm64_dec2_loop
+.p2align 4
L$ccm64_dec2_loop:
.byte 102,15,56,220,209
- decl %eax
.byte 102,15,56,220,217
- movups 16(%rcx),%xmm1
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
.byte 102,15,56,220,208
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
- movups 0(%rcx),%xmm0
+ movups -16(%rcx,%rax,1),%xmm0
jnz L$ccm64_dec2_loop
movups (%rdi),%xmm8
- paddq %xmm6,%xmm9
+ paddq %xmm9,%xmm6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
jmp L$ccm64_dec_outer
.p2align 4
L$ccm64_dec_break:
+ movl 240(%r11),%eax
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
@@ -916,23 +1019,58 @@ L$oop_enc1_6:
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
- jnz L$oop_enc1_6
+ jnz L$oop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne L$ctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp L$ctr32_epilogue
+
+.p2align 4
+L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je L$ctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -947,32 +1085,33 @@ _aesni_ctr32_encrypt_blocks:
movdqa %xmm2,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
movdqa %xmm2,112(%rsp)
- movl 240(%rcx),%eax
-
- leaq 1(%r8),%r9
- leaq 2(%r8),%r10
- bswapl %r9d
- bswapl %r10d
- xorl %r11d,%r9d
- xorl %r11d,%r10d
-.byte 102,65,15,58,34,217,3
- leaq 3(%r8),%r9
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
-.byte 102,65,15,58,34,226,3
- bswapl %r9d
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%r9d
+ xorl %r11d,%eax
bswapl %r10d
-.byte 102,65,15,58,34,233,3
+.byte 102,15,58,34,232,3
xorl %r11d,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
xorl %r11d,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
@@ -980,7 +1119,9 @@ _aesni_ctr32_encrypt_blocks:
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
+ movl __gnutls_x86_cpuid_s+4(%rip),%r10d
xorl %r11d,%r9d
+ andl $71303168,%r10d
movl %r9d,112+12(%rsp)
movups 16(%rcx),%xmm1
@@ -991,10 +1132,104 @@ _aesni_ctr32_encrypt_blocks:
cmpq $8,%rdx
jb L$ctr32_tail
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je L$ctr32_6x
+
leaq 128(%rcx),%rcx
- subq $8,%rdx
+ subq $2,%rdx
jmp L$ctr32_loop8
+.p2align 4
+L$ctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp L$ctr32_loop6
+
+.p2align 4
+L$ctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call L$enc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc L$ctr32_loop6
+
+ addq $6,%rdx
+ jz L$ctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp L$ctr32_tail
+
.p2align 5
L$ctr32_loop8:
addl $8,%r8d
@@ -1007,6 +1242,7 @@ L$ctr32_loop8:
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
xorl %r11d,%r9d
+ nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
leaq 1(%r8),%r9
@@ -1015,11 +1251,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 48-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,16+12(%rsp)
leaq 2(%r8),%r9
@@ -1028,11 +1265,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 64-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,32+12(%rsp)
leaq 3(%r8),%r9
@@ -1041,11 +1279,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 80-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,48+12(%rsp)
leaq 4(%r8),%r9
@@ -1054,11 +1293,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 96-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,64+12(%rsp)
leaq 5(%r8),%r9
@@ -1067,11 +1307,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 112-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
-.byte 102,15,56,220,224
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
.byte 102,15,56,220,232
movl %r9d,80+12(%rsp)
leaq 6(%r8),%r9
@@ -1080,11 +1321,12 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 128-128(%rcx),%xmm0
+ bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- bswapl %r9d
-.byte 102,15,56,220,225
xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
.byte 102,15,56,220,233
movl %r9d,96+12(%rsp)
leaq 7(%r8),%r9
@@ -1093,21 +1335,21 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 144-128(%rcx),%xmm1
+ bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- bswapl %r9d
.byte 102,15,56,220,224
xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
- movdqu 0(%rdi),%xmm10
.byte 102,68,15,56,220,200
movups 160-128(%rcx),%xmm0
- cmpl $11,%eax
jb L$ctr32_enc_done
.byte 102,15,56,220,209
@@ -1150,7 +1392,9 @@ L$ctr32_loop8:
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups 224-128(%rcx),%xmm0
+ jmp L$ctr32_enc_done
+.p2align 4
L$ctr32_enc_done:
movdqu 16(%rdi),%xmm11
pxor %xmm0,%xmm10
@@ -1162,8 +1406,8 @@ L$ctr32_enc_done:
pxor %xmm0,%xmm13
movdqu 80(%rdi),%xmm15
pxor %xmm0,%xmm14
-.byte 102,15,56,220,209
pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1172,26 +1416,26 @@ L$ctr32_enc_done:
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
.byte 102,65,15,56,221,210
pxor %xmm0,%xmm1
- movdqu 112(%rdi),%xmm10
- leaq 128(%rdi),%rdi
+ movdqu 112-128(%rdi),%xmm10
.byte 102,65,15,56,221,219
pxor %xmm0,%xmm10
movdqa 0(%rsp),%xmm11
.byte 102,65,15,56,221,228
- movdqa 16(%rsp),%xmm12
.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
movdqa 32(%rsp),%xmm13
.byte 102,65,15,56,221,246
- movdqa 48(%rsp),%xmm14
.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
movdqa 64(%rsp),%xmm15
.byte 102,68,15,56,221,193
movdqa 80(%rsp),%xmm0
-.byte 102,69,15,56,221,202
movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
movups %xmm2,(%rsi)
movdqa %xmm11,%xmm2
@@ -1217,29 +1461,32 @@ L$ctr32_enc_done:
leaq -128(%rcx),%rcx
L$ctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
+
+ shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
movups 16(%rcx),%xmm0
.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
.byte 102,15,56,220,217
- shrl $1,%eax
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
.byte 102,15,56,220,225
- decl %eax
-.byte 102,15,56,220,233
+ addq $16,%rax
movups (%rdi),%xmm10
+.byte 102,15,56,220,233
.byte 102,15,56,220,241
movups 16(%rdi),%xmm11
-.byte 102,15,56,220,249
movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
- movups 16(%rcx),%xmm1
call L$enc_loop8_enter
@@ -1272,19 +1519,19 @@ L$ctr32_tail:
L$ctr32_loop4:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups (%rcx),%xmm1
- decl %eax
jnz L$ctr32_loop4
.byte 102,15,56,221,209
- movups (%rdi),%xmm10
.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
movups 16(%rdi),%xmm11
.byte 102,15,56,221,225
- movups 32(%rdi),%xmm12
.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
movups 48(%rdi),%xmm13
xorps %xmm10,%xmm2
@@ -1301,10 +1548,10 @@ L$ctr32_loop4:
L$ctr32_loop3:
.byte 102,15,56,220,209
leaq 16(%rcx),%rcx
+ decl %eax
.byte 102,15,56,220,217
.byte 102,15,56,220,225
movups (%rcx),%xmm1
- decl %eax
jnz L$ctr32_loop3
.byte 102,15,56,221,209
.byte 102,15,56,221,217
@@ -1324,30 +1571,33 @@ L$ctr32_loop3:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp L$ctr32_done
-.p2align 4
-L$ctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
L$ctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
@@ -1359,245 +1609,285 @@ L$ctr32_epilogue:
_aesni_xts_encrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_8:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_8
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_enc_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
-.p2align 4
+.p2align 5
L$xts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp L$xts_enc_loop6
+.p2align 5
L$xts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$xts_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_enc_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz L$xts_enc_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm11
+ cmpq $0x20,%rdx
jb L$xts_enc_one
+ pxor %xmm0,%xmm12
je L$xts_enc_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm13
+ cmpq $0x40,%rdx
jb L$xts_enc_three
+ pxor %xmm0,%xmm14
je L$xts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1607,6 +1897,7 @@ L$xts_enc_short:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1638,7 +1929,7 @@ L$oop_enc1_9:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_9
+ jnz L$oop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -1654,7 +1945,7 @@ L$xts_enc_two:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -1700,15 +1991,15 @@ L$xts_enc_four:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_enc_done
@@ -1743,12 +2034,35 @@ L$oop_enc1_10:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_10
+ jnz L$oop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
@@ -1760,251 +2074,291 @@ L$xts_enc_epilogue:
_aesni_xts_decrypt:
leaq (%rsp),%rax
pushq %rbp
- subq $96,%rsp
+ subq $112,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- movups (%r9),%xmm15
+ movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
- xorps %xmm0,%xmm15
+ xorps %xmm0,%xmm2
L$oop_enc1_11:
-.byte 102,68,15,56,220,249
+.byte 102,15,56,220,209
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
- jnz L$oop_enc1_11
-.byte 102,68,15,56,221,249
+ jnz L$oop_enc1_11
+.byte 102,15,56,221,209
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ movdqa %xmm2,%xmm15
+ pshufd $0x5f,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_dec_short
- shrl $1,%eax
- subl $1,%eax
- movl %eax,%r10d
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
-.p2align 4
+.p2align 5
L$xts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $0x5f,%xmm15,%xmm9
+ jmp L$xts_dec_loop6
+.p2align 5
L$xts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$xts_dec_loop6_enter:
- movups 16(%rcx),%xmm1
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups (%rcx),%xmm0
+ movups -80(%rcx,%rax,1),%xmm0
jnz L$xts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
- movups 16(%rcx),%xmm1
+ movups -64(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,248
- movups 32(%rcx),%xmm0
+ movups -48(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
pxor %xmm9,%xmm15
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
- movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ movl $16+96,%eax
+ subl %r10d,%eax
movq %r11,%rcx
- movl %eax,%r10d
+ shrl $4,%eax
L$xts_dec_short:
+
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz L$xts_dec_done
- cmpq $32,%rdx
+ pxor %xmm0,%xmm12
+ cmpq $0x20,%rdx
jb L$xts_dec_one
+ pxor %xmm0,%xmm13
je L$xts_dec_two
- cmpq $64,%rdx
+ pxor %xmm0,%xmm14
+ cmpq $0x40,%rdx
jb L$xts_dec_three
je L$xts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -2030,7 +2384,7 @@ L$xts_dec_short:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz L$xts_dec_ret
@@ -2054,7 +2408,7 @@ L$oop_dec1_12:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_12
+ jnz L$oop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
@@ -2071,7 +2425,7 @@ L$xts_dec_two:
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
@@ -2097,7 +2451,7 @@ L$xts_dec_three:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -2107,14 +2461,8 @@ L$xts_dec_three:
.p2align 4
L$xts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -2125,16 +2473,16 @@ L$xts_dec_four:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_dec_done
@@ -2158,7 +2506,7 @@ L$oop_dec1_13:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_13
+ jnz L$oop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
@@ -2188,12 +2536,35 @@ L$oop_dec1_14:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_14
+ jnz L$oop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
L$xts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
@@ -2232,7 +2603,7 @@ L$oop_enc1_15:
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_enc1_15
+ jnz L$oop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
@@ -2242,26 +2613,59 @@ L$oop_enc1_15:
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
-.long 0x9066AAF3
+.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
- jmp L$cbc_enc_loop
+ jmp L$cbc_enc_loop
.p2align 4
L$cbc_decrypt:
+ cmpq $16,%rdx
+ jne L$cbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_ret
+.p2align 4
+L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2269,7 +2673,7 @@ L$cbc_decrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movups (%rcx),%xmm0
@@ -2284,10 +2688,15 @@ L$cbc_decrypt:
movdqa %xmm5,%xmm14
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
- cmpq $112,%rdx
+ movl __gnutls_x86_cpuid_s+4(%rip),%r9d
+ cmpq $0x70,%rdx
jbe L$cbc_dec_six_or_seven
- subq $112,%rdx
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je L$cbc_dec_loop6_enter
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp L$cbc_dec_loop8_enter
.p2align 4
@@ -2302,7 +2711,7 @@ L$cbc_dec_loop8_enter:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2316,8 +2725,8 @@ L$cbc_dec_loop8_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- setnc %r11b
.byte 102,68,15,56,222,193
+ setnc %r11b
shlq $7,%r11
.byte 102,68,15,56,222,201
addq %rdi,%r11
@@ -2331,6 +2740,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 64-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2340,6 +2750,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 80-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2349,6 +2760,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 96-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2358,6 +2770,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 112-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2367,6 +2780,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 128-112(%rcx),%xmm0
+ nop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -2376,6 +2790,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2385,7 +2800,6 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 160-112(%rcx),%xmm0
- cmpl $11,%eax
jb L$cbc_dec_done
.byte 102,15,56,222,209
.byte 102,15,56,222,217
@@ -2396,6 +2810,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 176-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2415,6 +2830,7 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 208-112(%rcx),%xmm1
+ nop
.byte 102,15,56,222,208
.byte 102,15,56,222,216
.byte 102,15,56,222,224
@@ -2424,18 +2840,20 @@ L$cbc_dec_loop8_enter:
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups 224-112(%rcx),%xmm0
+ jmp L$cbc_dec_done
+.p2align 4
L$cbc_dec_done:
.byte 102,15,56,222,209
- pxor %xmm0,%xmm10
.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
.byte 102,15,56,222,225
- pxor %xmm0,%xmm12
.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
pxor %xmm0,%xmm13
.byte 102,15,56,222,241
- pxor %xmm0,%xmm14
.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
pxor %xmm0,%xmm15
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
@@ -2447,16 +2865,16 @@ L$cbc_dec_done:
.byte 102,65,15,56,223,219
pxor %xmm0,%xmm10
movdqu 112(%rdi),%xmm0
- leaq 128(%rdi),%rdi
.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
movdqu 0(%r11),%xmm11
.byte 102,65,15,56,223,237
- movdqu 16(%r11),%xmm12
.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
movdqu 32(%r11),%xmm13
.byte 102,65,15,56,223,255
- movdqu 48(%r11),%xmm14
.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
movdqu 64(%r11),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
@@ -2478,21 +2896,21 @@ L$cbc_dec_done:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja L$cbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
- jle L$cbc_dec_tail_collected
+ addq $0x70,%rdx
+ jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe L$cbc_dec_tail
movaps %xmm11,%xmm2
L$cbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja L$cbc_dec_seven
movaps %xmm7,%xmm8
@@ -2503,14 +2921,19 @@ L$cbc_dec_six_or_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2525,36 +2948,88 @@ L$cbc_dec_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+L$cbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja L$cbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle L$cbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
L$cbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe L$cbc_dec_four
movups 64(%rdi),%xmm6
@@ -2568,13 +3043,18 @@ L$cbc_dec_tail:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
- subq $16,%rdx
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2584,12 +3064,12 @@ L$cbc_dec_one:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_16
+ jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2597,13 +3077,13 @@ L$oop_dec1_16:
.p2align 4
L$cbc_dec_two:
movaps %xmm3,%xmm12
- xorps %xmm4,%xmm4
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor %xmm10,%xmm2
movaps %xmm12,%xmm10
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2616,7 +3096,9 @@ L$cbc_dec_three:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2629,29 +3111,45 @@ L$cbc_dec_four:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
+L$cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
-.long 0x9066A4F3
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
@@ -2661,7 +3159,7 @@ L$cbc_ret:
.p2align 4
_aesni_set_decrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
@@ -2689,7 +3187,9 @@ L$dec_key_inverse:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2700,15 +3200,17 @@ L$SEH_end_set_decrypt_key:
.p2align 4
_aesni_set_encrypt_key:
__aesni_set_encrypt_key:
-.byte 0x48,0x83,0xEC,0x08
+.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz L$enc_key_ret
testq %rdx,%rdx
jz L$enc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl __gnutls_x86_cpuid_s+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@@ -2719,6 +3221,9 @@ __aesni_set_encrypt_key:
L$10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je L$10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
@@ -2746,9 +3251,79 @@ L$10rounds:
jmp L$enc_key_ret
.p2align 4
+L$10rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je L$12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
@@ -2772,10 +3347,54 @@ L$12rounds:
jmp L$enc_key_ret
.p2align 4
+L$12rounds_alt:
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je L$14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -2810,9 +3429,69 @@ L$14rounds:
jmp L$enc_key_ret
.p2align 4
+L$14rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
@@ -2898,6 +3577,14 @@ L$xts_magic:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long 1,1,1,1
+L$key_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/lib/accelerated/x86/macosx/ghash-x86_64.s b/lib/accelerated/x86/macosx/ghash-x86_64.s
index 3d314b8020..8fe772fd35 100644
--- a/lib/accelerated/x86/macosx/ghash-x86_64.s
+++ b/lib/accelerated/x86/macosx/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl _gcm_gmult_4bit
.p2align 4
@@ -58,14 +59,14 @@ L$gmult_prologue:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp L$oop1
.p2align 4
L$oop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -81,13 +82,13 @@ L$oop1:
js L$break1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -96,19 +97,19 @@ L$oop1:
.p2align 4
L$break1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -905,173 +906,177 @@ L$_gmult_clmul:
.p2align 5
_gcm_ghash_clmul:
L$_ghash_clmul:
- movdqa L$bswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+ movdqa L$bswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz L$odd_tail
- movdqu 16(%rsi),%xmm9
- cmpq $48,%rcx
+ movdqu 16(%rsi),%xmm6
+ movl __gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%rcx
jb L$skip4x
- subq $48,%rcx
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je L$skip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
- movdqu 48(%rdx),%xmm6
+ movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc L$tail4x
jmp L$mod4_loop
.p2align 5
L$mod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%rdx),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rsi),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa L$7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%rdx),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa L$7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rsi),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc L$mod4_loop
L$tail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
@@ -1095,10 +1100,10 @@ L$tail4x:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz L$done
- movdqu 32(%rsi),%xmm10
- subq $16,%rcx
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
jz L$odd_tail
L$skip4x:
@@ -1106,102 +1111,106 @@ L$skip4x:
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
- subq $32,%rcx
+ nop
+ subq $0x20,%rcx
jbe L$even_tail
+ nop
jmp L$mod_loop
.p2align 5
L$mod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
-
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%rdx),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%rdx),%xmm6
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%rdx),%rdx
- subq $32,%rcx
+ subq $0x20,%rcx
ja L$mod_loop
L$even_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
@@ -1230,15 +1239,15 @@ L$even_tail:
jnz L$done
L$odd_tail:
- movdqu (%rdx),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1271,7 +1280,7 @@ L$odd_tail:
psrlq $1,%xmm0
pxor %xmm1,%xmm0
L$done:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
@@ -1279,7 +1288,108 @@ L$done:
.p2align 5
_gcm_init_avx:
- jmp L$_init_clmul
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp L$init_start_avx
+.p2align 5
+L$init_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+L$init_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz L$init_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
.globl _gcm_gmult_avx
@@ -1291,7 +1401,377 @@ _gcm_gmult_avx:
.p2align 5
_gcm_ghash_avx:
- jmp L$_ghash_clmul
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq L$0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu L$bswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb L$short_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb L$tail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp L$oop8x_avx
+
+.p2align 5
+L$oop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc L$oop8x_avx
+
+ addq $0x80,%rcx
+ jmp L$tail_no_xor_avx
+
+.p2align 5
+L$short_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp L$tail_avx
+
+.p2align 5
+L$tail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+L$tail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne L$short_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
.p2align 6
L$bswap_mask:
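For reference, a minimal standalone sketch (not part of the patch) of the CPUID probe that gates the new AVX/MOVBE code path; it mirrors the check_avx_movbe() test added in x86-common.c below and assumes GCC/clang's <cpuid.h>. The masks 0x10000000 and 0x00400000 are ECX bits 28 (AVX) and 22 (MOVBE) of CPUID leaf 1, matching the fallback bit_AVX/bit_MOVBE defines added by the patch.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1 reports the basic feature flags in ECX/EDX. */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	if ((ecx & 0x10000000) && (ecx & 0x00400000))
		printf("AVX+MOVBE present: aesni-gcm-x86_64.s path is usable\n");
	else
		printf("no AVX+MOVBE: plain PCLMUL GCM path is used\n");
	return 0;
}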
diff --git a/lib/accelerated/x86/x86-common.c b/lib/accelerated/x86/x86-common.c
index 9ce5030c8b..dcd74a4f94 100644
--- a/lib/accelerated/x86/x86-common.c
+++ b/lib/accelerated/x86/x86-common.c
@@ -60,6 +60,14 @@ unsigned int _gnutls_x86_cpuid_s[4];
# define bit_AES 0x2000000
#endif
+#ifndef bit_AVX
+# define bit_AVX 0x10000000
+#endif
+
+#ifndef bit_MOVBE
+# define bit_MOVBE 0x00400000
+#endif
+
#define via_bit_PADLOCK (0x3 << 6)
#define via_bit_PADLOCK_PHE (0x3 << 10)
#define via_bit_PADLOCK_PHE_SHA512 (0x3 << 25)
@@ -70,6 +78,7 @@ unsigned int _gnutls_x86_cpuid_s[4];
#define INTEL_AES_NI (1<<1)
#define INTEL_SSSE3 (1<<2)
#define INTEL_PCLMUL (1<<3)
+#define INTEL_AVX (1<<4)
#define VIA_PADLOCK (1<<20)
#define VIA_PADLOCK_PHE (1<<21)
#define VIA_PADLOCK_PHE_SHA512 (1<<22)
@@ -104,6 +113,15 @@ static void capabilities_to_intel_cpuid(unsigned capabilities)
}
}
+ if (capabilities & INTEL_AVX) {
+ if ((b & bit_AVX) && (b & bit_MOVBE)) {
+ _gnutls_x86_cpuid_s[1] |= bit_AVX|bit_MOVBE;
+ } else {
+ _gnutls_debug_log
+ ("AVX acceleration requested but not available\n");
+ }
+ }
+
if (capabilities & INTEL_PCLMUL) {
if (b & bit_PCLMUL) {
_gnutls_x86_cpuid_s[1] |= bit_PCLMUL;
@@ -126,6 +144,11 @@ static unsigned check_ssse3(void)
}
#ifdef ASM_X86_64
+static unsigned check_avx_movbe(void)
+{
+ return ((_gnutls_x86_cpuid_s[1] & bit_AVX) && (_gnutls_x86_cpuid_s[1] & bit_MOVBE));
+}
+
static unsigned check_pclmul(void)
{
return (_gnutls_x86_cpuid_s[1] & bit_PCLMUL);
@@ -613,22 +636,42 @@ void register_x86_intel_crypto(unsigned capabilities)
#ifdef ASM_X86_64
if (check_pclmul()) {
/* register GCM ciphers */
- _gnutls_debug_log
- ("Intel GCM accelerator was detected\n");
- ret =
- gnutls_crypto_single_cipher_register
- (GNUTLS_CIPHER_AES_128_GCM, 80,
- &_gnutls_aes_gcm_pclmul, 0);
- if (ret < 0) {
- gnutls_assert();
- }
-
- ret =
- gnutls_crypto_single_cipher_register
- (GNUTLS_CIPHER_AES_256_GCM, 80,
- &_gnutls_aes_gcm_pclmul, 0);
- if (ret < 0) {
- gnutls_assert();
+ if (check_avx_movbe()) {
+ _gnutls_debug_log
+ ("Intel GCM accelerator (AVX) was detected\n");
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_128_GCM, 80,
+ &_gnutls_aes_gcm_pclmul_avx, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_256_GCM, 80,
+ &_gnutls_aes_gcm_pclmul_avx, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+ } else {
+ _gnutls_debug_log
+ ("Intel GCM accelerator was detected\n");
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_128_GCM, 80,
+ &_gnutls_aes_gcm_pclmul, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
+
+ ret =
+ gnutls_crypto_single_cipher_register
+ (GNUTLS_CIPHER_AES_256_GCM, 80,
+ &_gnutls_aes_gcm_pclmul, 0);
+ if (ret < 0) {
+ gnutls_assert();
+ }
}
} else
#endif
@@ -665,7 +708,7 @@ void register_x86_crypto(void)
if (p) {
capabilities = strtol(p, NULL, 0);
}
-
+
register_x86_intel_crypto(capabilities);
#ifdef ENABLE_PADLOCK
register_x86_padlock_crypto(capabilities);
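The GNUTLS_CPUID_OVERRIDE values exercised by the updated test script below are OR-combinations of the INTEL_* capability bits defined in x86-common.c above. A small illustrative sketch (not part of the patch) of the two masks the script now uses:

#include <stdio.h>

/* Capability bits as defined in x86-common.c. */
#define INTEL_AES_NI (1<<1)	/* 0x02 */
#define INTEL_PCLMUL (1<<3)	/* 0x08 */
#define INTEL_AVX    (1<<4)	/* 0x10 */

int main(void)
{
	/* 0x0a: AES-NI + PCLMUL -> plain _gnutls_aes_gcm_pclmul */
	printf("GNUTLS_CPUID_OVERRIDE=0x%x\n", INTEL_AES_NI | INTEL_PCLMUL);
	/* 0x1a: additionally request AVX -> _gnutls_aes_gcm_pclmul_avx,
	 * provided the CPU really exposes AVX and MOVBE */
	printf("GNUTLS_CPUID_OVERRIDE=0x%x\n",
	       INTEL_AES_NI | INTEL_PCLMUL | INTEL_AVX);
	return 0;
}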
diff --git a/tests/slow/test-ciphers-common.sh b/tests/slow/test-ciphers-common.sh
index e5e2d51ac8..cb415a35bc 100644
--- a/tests/slow/test-ciphers-common.sh
+++ b/tests/slow/test-ciphers-common.sh
@@ -51,13 +51,22 @@ if test $ret != 0; then
exit $ret
fi
-GNUTLS_CPUID_OVERRIDE=0x8 ${PROG}
+#AESNI+PCLMUL
+GNUTLS_CPUID_OVERRIDE=0xA ${PROG}
ret=$?
if test $ret != 0; then
echo "PCLMUL cipher tests failed"
exit $ret
fi
+#AESNI+PCLMUL+AVX
+GNUTLS_CPUID_OVERRIDE=0x1A ${PROG}
+ret=$?
+if test $ret != 0; then
+ echo "PCLMUL-AVX cipher tests failed"
+ exit $ret
+fi
+
GNUTLS_CPUID_OVERRIDE=0x100000 ${PROG}
ret=$?
if test $ret != 0; then