author     Shigeki Ohtsu <ohtsu@iij.ad.jp>  2015-06-12 00:09:20 +0900
committer  Shigeki Ohtsu <ohtsu@iij.ad.jp>  2015-06-12 09:47:45 +0900
commit     94804969b76dff831f195d5ddd355e3b9094b629 (patch)
tree       e7e0f890419913f865f51c2fe99d920eb08a6cab /deps
parent     38444915e0953dc092dde6d749b5cc5005dec90e (diff)
download   node-new-94804969b76dff831f195d5ddd355e3b9094b629.tar.gz
deps: update asm files for openssl-1.0.2b
The asm files are generated as follows:
- In `deps/openssl/asm/`, run make with CC=gcc and ASM=nasm
- In `deps/openssl/asm_obsolete/`, run make with no compiler environment variables set

Fixes: https://github.com/nodejs/io.js/issues/1921
PR-URL: https://github.com/nodejs/io.js/pull/1950
Reviewed-By: Fedor Indutny <fedor@indutny.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
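A minimal sketch of the regeneration commands implied by the message above (directory names are taken from the diffstat; the exact behaviour of the Makefiles under deps/openssl is an assumption and may differ between Node.js releases):

    # deps/openssl/asm: generate with the compiler/assembler given in the environment
    cd deps/openssl/asm
    CC=gcc ASM=nasm make

    # deps/openssl/asm_obsolete: generate with no compiler environment variables set
    cd ../asm_obsolete
    make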
Diffstat (limited to 'deps')
-rw-r--r--  deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S | 206
-rw-r--r--  deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S | 2
-rw-r--r--  deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S | 202
-rw-r--r--  deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S | 212
-rw-r--r--  deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S | 206
-rw-r--r--  deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S | 200
-rw-r--r--  deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s | 502
-rw-r--r--  deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s | 13
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s | 502
-rw-r--r--  deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s | 13
-rw-r--r--  deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm | 522
-rw-r--r--  deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm | 13
-rw-r--r--  deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s | 790
-rw-r--r--  deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s | 794
-rw-r--r--  deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm | 793
-rw-r--r--  deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S | 206
-rw-r--r--  deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S | 2
-rw-r--r--  deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S | 202
-rw-r--r--  deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S | 212
-rw-r--r--  deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S | 206
-rw-r--r--  deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S | 200
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s | 502
-rw-r--r--  deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s | 13
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s | 502
-rw-r--r--  deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s | 13
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm | 71
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm | 522
-rw-r--r--  deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm | 13
-rw-r--r--  deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s | 790
-rw-r--r--  deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s | 794
-rw-r--r--  deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm | 793
31 files changed, 7274 insertions(+), 2737 deletions(-)
diff --git a/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S b/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S
index 732ba3d9c8..fd979d078f 100644
--- a/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S
+++ b/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S
@@ -230,17 +230,17 @@ aes_v8_encrypt:
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
- vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q1},[r2]!
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
@@ -259,17 +259,17 @@ aes_v8_decrypt:
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
- vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q1},[r2]!
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
@@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
veor q5,q8,q7
beq .Lcbc_enc128
+ vld1.32 {q2-q3},[r7]
+ add r7,r3,#16
+ add r6,r3,#16*4
+ add r12,r3,#16*5
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r14,r3,#16*6
+ add r3,r3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r6,r6,#2
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- bgt .Loop_cbc_enc
+ .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r6]
+ cmp r5,#4
+ .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r12]
+ beq .Lcbc_enc192
+
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r14]
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r3]
+ nop
+.Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
@@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
@@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
-
- mov r6,r5
veor q6,q0,q7
- vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
+ vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
@@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- veor q4,q6,q7
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q4,q6,q7
+ subs r2,r2,#0x30
veor q5,q2,q7
+ movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- veor q9,q3,q7
- subs r2,r2,#0x30
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vorr q6,q11,q11
- movlo r6,r2 @ r6, r6, is zero at this point
- .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
- .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+ veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
+ vorr q6,q11,q11
+ mov r7,r3
+ .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- mov r7,r3
- .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
- .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
vld1.8 {q2},[r0]!
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
- .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
- vld1.8 {q11},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
-
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vorr q0,q2,q2
vst1.8 {q4},[r1]!
- vorr q1,q3,q3
+ vorr q0,q2,q2
vst1.8 {q5},[r1]!
+ vorr q1,q3,q3
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs .Loop3x_cbc_dec
@@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- vld1.32 {q8},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- vld1.32 {q9},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
@@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
- mov r7,r3
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
- vld1.8 {q2},[r0]!
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
- .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.8 {q2},[r0]!
vorr q0,q6,q6
- .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+ .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+ .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
- .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
- .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vorr q1,q6,q6
+ .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vld1.8 {q11},[r0]!
+ .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.8 {q11},[r0]!
+ mov r7,r3
+ .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
- .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7
add r10,r8,#2
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
- .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7
rev r9,r9
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
- .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
@@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
- mov r6,r5
veor q2,q2,q4
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vst1.8 {q2},[r1]!
veor q3,q3,q5
+ mov r6,r5
+ vst1.8 {q3},[r1]!
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vst1.8 {q2},[r1]!
- vst1.8 {q3},[r1]!
vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32
@@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q9},[r7]!
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
- vld1.8 {q3},[r0]
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
veor q2,q2,q7
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
diff --git a/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S b/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S
index d321235f79..c54f514997 100644
--- a/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S
+++ b/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S
@@ -495,7 +495,7 @@ gcm_ghash_neon:
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
- veor d1,d1,d20 @
+ veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase
diff --git a/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S b/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S
index 570d9175c4..269574945f 100644
--- a/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S
+++ b/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S
@@ -7,109 +7,223 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
- vld1.64 {q9},[r1] @ load H
- vmov.i8 q8,#0xe1
+ vld1.64 {q9},[r1] @ load input H
+ vmov.i8 q11,#0xe1
+ vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
- vshl.i64 q8,q8,#57
- vshr.u64 q10,q8,#63
- vext.8 q8,q10,q8,#8 @ t0=0xc2....01
+ vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
- vshr.u64 q11,q3,#63
+ vext.8 q8,q10,q11,#8 @ t0=0xc2....01
+ vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
- vand q11,q11,q8
+ vand q10,q10,q8
vshl.i64 q3,q3,#1
- vext.8 q11,q11,q11,#8
+ vext.8 q10,q10,q10,#8
vand q8,q8,q9
- vorr q3,q3,q11 @ H<<<=1
- veor q3,q3,q8 @ twisted H
- vst1.64 {q3},[r0]
+ vorr q3,q3,q10 @ H<<<=1
+ veor q12,q3,q8 @ twisted H
+ vst1.64 {q12},[r0]! @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
+ .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+ veor q8,q8,q12
+ .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+ .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q14,q0,q10
+
+ vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ veor q9,q9,q14
+ vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
+ vst1.64 {q13-q14},[r0] @ store Htable[1..2]
bx lr
.size gcm_init_v8,.-gcm_init_v8
-
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
- vld1.64 {q12},[r1] @ load twisted H
+ vld1.64 {q12-q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
- vext.8 q13,q12,q12,#8
- mov r3,#0
vext.8 q3,q9,q9,#8
- mov r12,#0
- veor q13,q13,q12 @ Karatsuba pre-processing
- mov r2,r0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
+ .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+ .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+ .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ bx lr
+.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
+ vstmdb sp!,{d8-d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
- subs r3,r3,#16
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+ @ alorithm specification
+ subs r3,r3,#32 @ see if r3 is 32 or larger
+ mov r12,#16 @ r12 is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ r12 is zeroed just in time
+ @ to preclude oversteping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
- mov r12,#16
- vld1.64 {q12},[r1] @ load twisted H
- moveq r12,#0
- vext.8 q0,q0,q0,#8
- vshl.u64 q11,q11,#57
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
- vext.8 q13,q12,q12,#8
+ vld1.64 {q14},[r1]
+ moveq r12,#0 @ is it time to zero r12?
+ vext.8 q0,q0,q0,#8 @ rotate Xi
+ vld1.64 {q8},[r2]! @ load [rotated] I[0]
+ vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
+ vrev64.8 q8,q8
vrev64.8 q0,q0
+#endif
+ vext.8 q3,q8,q8,#8 @ rotate I[0]
+ blo .Lodd_tail_v8 @ r3 was less than 32
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
+#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
- veor q13,q13,q12 @ Karatsuba pre-processing
- vext.8 q3,q9,q9,#8
- b .Loop_v8
+ vext.8 q7,q9,q9,#8
+ veor q3,q3,q0 @ I[i]^=Xi
+ .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ b .Loop_mod2x_v8
.align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+ vext.8 q10,q3,q3,#8
+ subs r3,r3,#32 @ is there more data?
+ .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
+ movlo r12,#0 @ is it time to zero r12?
+
+ .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
+ veor q10,q10,q3 @ Karatsuba pre-processing
+ .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
+ veor q0,q0,q4 @ accumulate
+ .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
+
+ veor q2,q2,q6
+ moveq r12,#0 @ is it time to zero r12?
+ veor q1,q1,q5
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+#endif
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ vext.8 q7,q9,q9,#8
+ vext.8 q3,q8,q8,#8
+ veor q0,q1,q10
+ .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q3,q3,q2 @ accumulate q3 early
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q3,q3,q10
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ veor q3,q3,q0
+ .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
+
+ veor q2,q2,q10
+ vext.8 q3,q8,q8,#8 @ re-construct q3
+ adds r3,r3,#32 @ re-construct r3
+ veor q0,q0,q2 @ re-construct q0
+ beq .Ldone_v8 @ is r3 zero?
+.Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
- veor q9,q9,q10 @ q9 is rotated inp^Xi
+ veor q9,q8,q10 @ q9 is rotated inp^Xi
-.Lgmult_v8:
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
- subs r3,r3,#16
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
- moveq r12,#0
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
veor q1,q1,q10
- .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
veor q0,q1,q10
- vext.8 q3,q9,q9,#8
- vext.8 q10,q0,q0,#8 @ 2nd phase
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
- bhs .Loop_v8
+.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
+ vldmia sp!,{d8-d15} @ 32-bit ABI says so
bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
diff --git a/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S b/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S
index bf1ce4f997..683f1cc0c8 100644
--- a/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S
+++ b/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S
@@ -1,7 +1,59 @@
-#include "arm_arch.h"
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
.text
+#if __ARM_ARCH__<7
.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
.type K256,%object
.align 5
@@ -24,7 +76,7 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
@@ -33,9 +85,12 @@ K256:
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
@@ -43,6 +98,7 @@ sha256_block_data_order:
tst r12,#ARMV7_NEON
bne .LNEON
#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256+32 @ K256
@@ -1736,6 +1792,9 @@ sha256_block_data_order:
eor r12,r12,r6 @ Maj(a,b,c)
add r4,r4,r0,ror#2 @ h+=Sigma0(a)
@ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq r3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
@@ -1777,16 +1836,19 @@ sha256_block_data_order:
.arch armv7-a
.fpu neon
+.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
mov r12,sp
- sub sp,sp,#16*4+16 @ alloca
- sub r14,r3,#256+32 @ K256
- bic sp,sp,#15 @ align for 128-bit stores
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]!
@@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
ldr r0,[sp,#72]
sub r14,r14,#256 @ rewind r14
teq r1,r0
+ it eq
subeq r1,r1,#64 @ avoid SEGV
vld1.8 {q0},[r1]! @ load next input block
vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]!
+ it ne
strne r1,[sp,#68]
mov r1,sp
add r11,r11,r2
@@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
str r7,[r2],#4
stmia r2,{r8-r11}
+ ittte ne
movne r1,sp
ldrne r2,[sp,#0]
eorne r12,r12,r12
ldreq sp,[sp,#76] @ restore original sp
+ itt ne
eorne r3,r5,r6
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {q0,q1},[r0]
- sub r3,r3,#sha256_block_data_order-K256
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {q8-q9},[r1]!
@@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
teq r1,r2
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vld1.32 {q13},[r3]
vadd.i32 q12,q12,q10
sub r3,r3,#256-16 @ rewind
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vadd.i32 q13,q13,q11
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vadd.i32 q0,q0,q14
vadd.i32 q1,q1,q15
+ it ne
bne .Loop_v8
vst1.32 {q0,q1},[r0]
@@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
#endif
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
diff --git a/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S b/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S
index 0a4b1ac4c4..f5dd6cbb86 100644
--- a/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S
+++ b/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S
@@ -227,17 +227,17 @@ aes_v8_encrypt:
.Loop_enc:
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -256,17 +256,17 @@ aes_v8_decrypt:
.Loop_dec:
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
+ ld1 {v2.4s-v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
- subs w6,w6,#2
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
aese v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
- b.gt .Loop_cbc_enc
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
@@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
- add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
@@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
-
- mov w6,w5
eor v6.16b,v0.16b,v7.16b
- st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
+ st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
@@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- eor v4.16b,v6.16b,v7.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- eor v17.16b,v3.16b,v7.16b
- subs x2,x2,#0x30
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
- csel x6,x2,x6,lo // x6, w6, is zero at this point
- aesd v0.16b,v20.16b
- aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
+ eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- mov x7,x3
- aesd v0.16b,v21.16b
- aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
- aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
- ld1 {v19.16b},[x0],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
-
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- orr v0.16b,v2.16b,v2.16b
st1 {v4.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
+ orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
@@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
@@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- aese v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- mov x7,x3
aesmc v4.16b,v0.16b
- ld1 {v2.16b},[x0],#16
+ aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
- aesmc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
- aese v5.16b,v17.16b
- aese v18.16b,v17.16b
orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
- ld1 {v19.16b},[x0],#16
+ aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
- aese v17.16b,v20.16b
+ aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
+ aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
- aese v17.16b,v21.16b
+ aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
- aesmc v4.16b,v4.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- aesmc v5.16b,v5.16b
+ aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
- aese v17.16b,v22.16b
+ aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
+ aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
@@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
aese v5.16b,v23.16b
aese v17.16b,v23.16b
- mov w6,w5
eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v2.16b},[x1],#16
- st1 {v3.16b},[x1],#16
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
@@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
- aese v1.16b,v20.16b
- ld1 {v3.16b},[x0]
aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
- aese v1.16b,v21.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
- aese v0.16b,v22.16b
- aese v1.16b,v22.16b
eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
diff --git a/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S b/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S
index 1bfb26340a..479007dc54 100644
--- a/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S
+++ b/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S
@@ -6,103 +6,215 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
- ld1 {v17.2d},[x1] //load H
- movi v16.16b,#0xe1
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
- shl v16.2d,v16.2d,#57
- ushr v18.2d,v16.2d,#63
- ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
- ushr v19.2d,v3.2d,#63
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
- and v19.16b,v19.16b,v16.16b
+ and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
- ext v19.16b,v19.16b,v19.16b,#8
+ ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
- orr v3.16b,v3.16b,v19.16b //H<<<=1
- eor v3.16b,v3.16b,v16.16b //twisted H
- st1 {v3.2d},[x0]
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
-
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
- ld1 {v20.2d},[x1] //load twisted H
+ ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
- ext v21.16b,v20.16b,v20.16b,#8
- mov x3,#0
ext v3.16b,v17.16b,v17.16b,#8
- mov x12,#0
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- mov x2,x0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
- subs x3,x3,#16
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+ //alorithm specification
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+ //to preclude oversteping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
- mov x12,#16
- ld1 {v20.2d},[x1] //load twisted H
- csel x12,xzr,x12,eq
- ext v0.16b,v0.16b,v0.16b,#8
- shl v19.2d,v19.2d,#57
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
- ext v21.16b,v20.16b,v20.16b,#8
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo .Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- ext v3.16b,v17.16b,v17.16b,#8
- b .Loop_v8
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b .Loop_mod2x_v8
.align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __ARMEB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq .Ldone_v8 //is x3 zero?
+.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
- eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
-.Lgmult_v8:
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
- subs x3,x3,#16
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
- csel x12,xzr,x12,eq
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
-#ifndef __ARMEB__
- rev64 v17.16b,v17.16b
-#endif
eor v0.16b,v1.16b,v18.16b
- ext v3.16b,v17.16b,v17.16b,#8
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
- b.hs .Loop_v8
+.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s
index 84708afbbb..6573fe4be3 100644
--- a/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s
@@ -17,7 +17,10 @@ aesni_encrypt:
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
@@ -38,7 +41,10 @@ aesni_decrypt:
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
.type _aesni_encrypt2,@function
@@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,220,217
pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,222,217
pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -587,6 +577,7 @@ aesni_ecb_encrypt:
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ aesni_ecb_encrypt:
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
@@ -731,14 +730,23 @@ aesni_ecb_encrypt:
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
@@ -754,49 +762,73 @@ aesni_ecb_encrypt:
jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
leaq 16(%r11),%r11
jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-.align 16
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-
-.align 16
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
shrl $4,%eax
.Lxts_enc_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
shrl $4,%eax
.Lxts_dec_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
@@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
movdqa %xmm7,%xmm2
addq $80,%rdx
- jle .Lcbc_dec_tail_collected
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
subq $16,%rdx
jmp .Lcbc_dec_tail_collected
@@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@@ -3008,9 +3212,79 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@@ -3034,10 +3308,54 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
index 84dd72075d..db3fe399ab 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s
@@ -2884,11 +2884,16 @@ sqrx8x_reduction:
.type bn_get_bits5,@function
.align 16
bn_get_bits5:
- movq %rdi,%r10
+ leaq 0(%rdi),%r10
+ leaq 1(%rdi),%r11
movl %esi,%ecx
- shrl $3,%esi
- movzwl (%r10,%rsi,1),%eax
- andl $7,%ecx
+ shrl $4,%esi
+ andl $15,%ecx
+ leal -8(%rcx),%eax
+ cmpl $11,%ecx
+ cmovaq %r11,%r10
+ cmoval %eax,%ecx
+ movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s
index 57509ae719..41ad80eebd 100644
--- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s
@@ -17,7 +17,10 @@ L$oop_enc1_1:
leaq 16(%rdx),%rdx
jnz L$oop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -38,7 +41,10 @@ L$oop_dec1_2:
leaq 16(%rdx),%rdx
jnz L$oop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$enc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$dec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,220,217
pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp L$enc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
+L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,222,217
pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp L$dec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
+L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -587,6 +577,7 @@ L$ecb_enc_tail:
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz L$ecb_ret
@@ -731,14 +730,23 @@ L$ecb_dec_tail:
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
@@ -754,49 +762,73 @@ L$oop_dec1_4:
jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
L$ecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ L$oop_enc1_6:
leaq 16(%r11),%r11
jnz L$oop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne L$ctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp L$ctr32_epilogue
+
+.p2align 4
+L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je L$ctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
leaq -128(%rcx),%rcx
L$ctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
+
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ L$ctr32_loop3:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp L$ctr32_done
-.p2align 4
-L$ctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
L$ctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
@@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
shrl $4,%eax
L$xts_enc_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@@ -1778,6 +1858,7 @@ L$xts_enc_short:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1920,6 +2001,29 @@ L$oop_enc1_10:
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
@@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
shrl $4,%eax
L$xts_dec_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ L$oop_dec1_14:
movups %xmm2,(%rsi)
L$xts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
@@ -2446,7 +2574,11 @@ L$oop_enc1_15:
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
@@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
.p2align 4
L$cbc_decrypt:
+ cmpq $16,%rdx
+ jne L$cbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_ret
+.p2align 4
+L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2702,7 +2863,7 @@ L$cbc_dec_done:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
- jle L$cbc_dec_tail_collected
+ jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
movdqa %xmm7,%xmm2
addq $80,%rdx
- jle L$cbc_dec_tail_collected
+ jle L$cbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
subq $16,%rdx
jmp L$cbc_dec_tail_collected
@@ -2847,12 +3025,12 @@ L$cbc_dec_one:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_16
+ jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ L$cbc_dec_two:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2878,7 +3057,9 @@ L$cbc_dec_three:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2891,29 +3072,45 @@ L$cbc_dec_four:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
+L$cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
@@ -2951,7 +3148,9 @@ L$dec_key_inverse:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz L$enc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl _OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
L$10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je L$10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
@@ -3008,9 +3212,79 @@ L$10rounds:
jmp L$enc_key_ret
.p2align 4
+L$10rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je L$12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
@@ -3034,10 +3308,54 @@ L$12rounds:
jmp L$enc_key_ret
.p2align 4
+L$12rounds_alt:
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je L$14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ L$14rounds:
jmp L$enc_key_ret
.p2align 4
+L$14rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ L$xts_magic:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long 1,1,1,1
+L$key_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
index 65cf9993d8..5470fb0336 100644
--- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s
@@ -2884,11 +2884,16 @@ L$sqrx4x_sub:
.p2align 4
_bn_get_bits5:
- movq %rdi,%r10
+ leaq 0(%rdi),%r10
+ leaq 1(%rdi),%r11
movl %esi,%ecx
- shrl $3,%esi
- movzwl (%r10,%rsi,1),%eax
- andl $7,%ecx
+ shrl $4,%esi
+ andl $15,%ecx
+ leal -8(%rcx),%eax
+ cmpl $11,%ecx
+ cmovaq %r11,%r10
+ cmoval %eax,%ecx
+ movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm b/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm
index 53d8afc950..5e848125d6 100644
--- a/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm
+++ b/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm
@@ -18,7 +18,10 @@ DB 102,15,56,220,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_enc1_1
DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
+ pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_encrypt ENDP
@@ -39,7 +42,10 @@ DB 102,15,56,222,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_dec1_2
DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
+ pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_decrypt ENDP
@@ -265,21 +271,18 @@ DB 102,15,56,220,217
pxor xmm6,xmm0
DB 102,15,56,220,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop6_enter
ALIGN 16
$L$enc_loop6::
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
+$L$enc_loop6_enter::
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
-$L$enc_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,220,208
@@ -322,21 +325,18 @@ DB 102,15,56,222,217
pxor xmm6,xmm0
DB 102,15,56,222,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop6_enter
ALIGN 16
$L$dec_loop6::
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
+$L$dec_loop6_enter::
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
-$L$dec_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,222,208
@@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,220,209
- add rax,16
pxor xmm7,xmm0
-DB 102,15,56,220,217
pxor xmm8,xmm0
+DB 102,15,56,220,217
pxor xmm9,xmm0
-DB 102,15,56,220,225
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
-DB 102,68,15,56,220,193
-DB 102,68,15,56,220,201
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
- jmp $L$enc_loop8_enter
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
+ add rax,16
+ jmp $L$enc_loop8_inner
ALIGN 16
$L$enc_loop8::
DB 102,15,56,220,209
DB 102,15,56,220,217
+$L$enc_loop8_inner::
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
@@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,222,209
- add rax,16
pxor xmm7,xmm0
-DB 102,15,56,222,217
pxor xmm8,xmm0
+DB 102,15,56,222,217
pxor xmm9,xmm0
-DB 102,15,56,222,225
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
-DB 102,68,15,56,222,193
-DB 102,68,15,56,222,201
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
- jmp $L$dec_loop8_enter
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
+ add rax,16
+ jmp $L$dec_loop8_inner
ALIGN 16
$L$dec_loop8::
DB 102,15,56,222,209
DB 102,15,56,222,217
+$L$dec_loop8_inner::
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
@@ -605,6 +595,7 @@ $L$ecb_enc_tail::
movups xmm7,XMMWORD PTR[80+rdi]
je $L$ecb_enc_six
movdqu xmm8,XMMWORD PTR[96+rdi]
+ xorps xmm9,xmm9
call _aesni_encrypt8
movups XMMWORD PTR[rsi],xmm2
movups XMMWORD PTR[16+rsi],xmm3
@@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
jnc $L$ecb_dec_loop8
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
mov rcx,r11
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
mov eax,r10d
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
+ pxor xmm8,xmm8
movups XMMWORD PTR[112+rsi],xmm9
+ pxor xmm9,xmm9
lea rsi,QWORD PTR[128+rsi]
add rdx,080h
jz $L$ecb_ret
@@ -749,14 +748,23 @@ $L$ecb_dec_tail::
je $L$ecb_dec_six
movups xmm8,XMMWORD PTR[96+rdi]
movups xmm0,XMMWORD PTR[rcx]
+ xorps xmm9,xmm9
call _aesni_decrypt8
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_one::
@@ -772,53 +780,81 @@ DB 102,15,56,222,209
jnz $L$oop_dec1_4
DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_two::
call _aesni_decrypt2
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_three::
call _aesni_decrypt3
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_four::
call _aesni_decrypt4
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_five::
xorps xmm7,xmm7
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_six::
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
$L$ecb_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ecb_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -898,11 +934,21 @@ DB 102,15,56,0,215
lea rsi,QWORD PTR[16+rsi]
jnz $L$ccm64_enc_outer
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
+ pxor xmm3,xmm3
+ pxor xmm8,xmm8
+ pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -1016,11 +1062,21 @@ DB 102,15,56,220,217
lea r11,QWORD PTR[16+r11]
jnz $L$oop_enc1_6
DB 102,15,56,221,217
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
+ pxor xmm3,xmm3
+ pxor xmm8,xmm8
+ pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_dec_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
mov r8,QWORD PTR[40+rsp]
+ cmp rdx,1
+ jne $L$ctr32_bulk
+
+
+
+ movups xmm2,XMMWORD PTR[r8]
+ movups xmm3,XMMWORD PTR[rdi]
+ mov edx,DWORD PTR[240+rcx]
+ movups xmm0,XMMWORD PTR[rcx]
+ movups xmm1,XMMWORD PTR[16+rcx]
+ lea rcx,QWORD PTR[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_enc1_7::
+DB 102,15,56,220,209
+ dec edx
+ movups xmm1,XMMWORD PTR[rcx]
+ lea rcx,QWORD PTR[16+rcx]
+ jnz $L$oop_enc1_7
+DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD PTR[rsi],xmm2
+ xorps xmm2,xmm2
+ jmp $L$ctr32_epilogue
+
+ALIGN 16
+$L$ctr32_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,288
@@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
$L$ctr32_body::
lea rbp,QWORD PTR[((-8))+rax]
- cmp rdx,1
- je $L$ctr32_one_shortcut
+
+
movdqu xmm2,XMMWORD PTR[r8]
movdqu xmm0,XMMWORD PTR[rcx]
@@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202
lea rcx,QWORD PTR[((-128))+rcx]
$L$ctr32_tail::
+
+
lea rcx,QWORD PTR[16+rcx]
cmp rdx,4
jb $L$ctr32_loop3
je $L$ctr32_loop4
+
shl eax,4
movdqa xmm8,XMMWORD PTR[96+rsp]
pxor xmm9,xmm9
@@ -1559,40 +1647,43 @@ DB 102,15,56,221,225
movups xmm12,XMMWORD PTR[32+rdi]
xorps xmm4,xmm12
movups XMMWORD PTR[32+rsi],xmm4
- jmp $L$ctr32_done
-ALIGN 16
-$L$ctr32_one_shortcut::
- movups xmm2,XMMWORD PTR[r8]
- movups xmm10,XMMWORD PTR[rdi]
- mov eax,DWORD PTR[240+rcx]
- movups xmm0,XMMWORD PTR[rcx]
- movups xmm1,XMMWORD PTR[16+rcx]
- lea rcx,QWORD PTR[32+rcx]
- xorps xmm2,xmm0
-$L$oop_enc1_7::
-DB 102,15,56,220,209
- dec eax
- movups xmm1,XMMWORD PTR[rcx]
- lea rcx,QWORD PTR[16+rcx]
- jnz $L$oop_enc1_7
-DB 102,15,56,221,209
- xorps xmm2,xmm10
- movups XMMWORD PTR[rsi],xmm2
- jmp $L$ctr32_done
-
-ALIGN 16
$L$ctr32_done::
+ xorps xmm0,xmm0
+ xor r11d,r11d
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
+ movaps XMMWORD PTR[112+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$ctr32_epilogue::
@@ -1889,6 +1980,7 @@ DB 102,15,56,221,124,36,80
shr eax,4
$L$xts_enc_short::
+
mov r10d,eax
pxor xmm10,xmm0
add rdx,16*6
@@ -1917,6 +2009,7 @@ $L$xts_enc_short::
pxor xmm4,xmm12
pxor xmm5,xmm13
pxor xmm6,xmm14
+ pxor xmm7,xmm7
call _aesni_encrypt6
@@ -2059,16 +2152,39 @@ DB 102,15,56,221,209
movups XMMWORD PTR[(-16)+rsi],xmm2
$L$xts_enc_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_enc_epilogue::
@@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80
shr eax,4
$L$xts_dec_short::
+
mov r10d,eax
pxor xmm10,xmm0
pxor xmm11,xmm0
@@ -2573,16 +2690,39 @@ DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
$L$xts_dec_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_dec_epilogue::
@@ -2646,7 +2786,11 @@ DB 102,15,56,221,209
jnc $L$cbc_enc_loop
add rdx,16
jnz $L$cbc_enc_tail
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[r8],xmm2
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
jmp $L$cbc_ret
$L$cbc_enc_tail::
@@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
ALIGN 16
$L$cbc_decrypt::
+ cmp rdx,16
+ jne $L$cbc_decrypt_bulk
+
+
+
+ movdqu xmm2,XMMWORD PTR[rdi]
+ movdqu xmm3,XMMWORD PTR[r8]
+ movdqa xmm4,xmm2
+ movups xmm0,XMMWORD PTR[rcx]
+ movups xmm1,XMMWORD PTR[16+rcx]
+ lea rcx,QWORD PTR[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_dec1_16::
+DB 102,15,56,222,209
+ dec r10d
+ movups xmm1,XMMWORD PTR[rcx]
+ lea rcx,QWORD PTR[16+rcx]
+ jnz $L$oop_dec1_16
+DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movdqu XMMWORD PTR[r8],xmm4
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
+ jmp $L$cbc_ret
+ALIGN 16
+$L$cbc_decrypt_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,176
@@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202
movaps xmm2,xmm9
lea rcx,QWORD PTR[((-112))+rcx]
add rdx,070h
- jle $L$cbc_dec_tail_collected
+ jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm9
lea rsi,QWORD PTR[16+rsi]
cmp rdx,050h
@@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
lea rsi,QWORD PTR[80+rsi]
movdqa xmm2,xmm7
+ pxor xmm7,xmm7
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
pxor xmm8,xmm9
movdqu XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
lea rsi,QWORD PTR[96+rsi]
movdqa xmm2,xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
movdqa xmm2,xmm7
add rdx,050h
- jle $L$cbc_dec_tail_collected
+ jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm7
lea rsi,QWORD PTR[16+rsi]
@@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
lea rsi,QWORD PTR[64+rsi]
movdqa xmm2,xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
sub rdx,010h
jmp $L$cbc_dec_tail_collected
@@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
-$L$oop_dec1_16::
+$L$oop_dec1_17::
DB 102,15,56,222,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
- jnz $L$oop_dec1_16
+ jnz $L$oop_dec1_17
DB 102,15,56,223,209
xorps xmm2,xmm10
movaps xmm10,xmm11
@@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
pxor xmm3,xmm11
movdqu XMMWORD PTR[rsi],xmm2
movdqa xmm2,xmm3
+ pxor xmm3,xmm3
lea rsi,QWORD PTR[16+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movdqa xmm2,xmm4
+ pxor xmm4,xmm4
lea rsi,QWORD PTR[32+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movdqa xmm2,xmm5
+ pxor xmm5,xmm5
lea rsi,QWORD PTR[48+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
+$L$cbc_dec_clear_tail_collected::
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
$L$cbc_dec_tail_collected::
movups XMMWORD PTR[r8],xmm10
and rdx,15
jnz $L$cbc_dec_tail_partial
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
jmp $L$cbc_dec_ret
ALIGN 16
$L$cbc_dec_tail_partial::
movaps XMMWORD PTR[rsp],xmm2
+ pxor xmm2,xmm2
mov rcx,16
mov rdi,rsi
sub rcx,rdx
lea rsi,QWORD PTR[rsp]
DD 09066A4F3h
+ movdqa XMMWORD PTR[rsp],xmm2
$L$cbc_dec_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm7,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm8,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
movaps xmm9,XMMWORD PTR[64+rsp]
+ movaps XMMWORD PTR[64+rsp],xmm0
movaps xmm10,XMMWORD PTR[80+rsp]
+ movaps XMMWORD PTR[80+rsp],xmm0
movaps xmm11,XMMWORD PTR[96+rsp]
+ movaps XMMWORD PTR[96+rsp],xmm0
movaps xmm12,XMMWORD PTR[112+rsp]
+ movaps XMMWORD PTR[112+rsp],xmm0
movaps xmm13,XMMWORD PTR[128+rsp]
+ movaps XMMWORD PTR[128+rsp],xmm0
movaps xmm14,XMMWORD PTR[144+rsp]
+ movaps XMMWORD PTR[144+rsp],xmm0
movaps xmm15,XMMWORD PTR[160+rsp]
+ movaps XMMWORD PTR[160+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$cbc_ret::
@@ -3175,7 +3390,9 @@ DB 102,15,56,219,201
movups xmm0,XMMWORD PTR[r8]
DB 102,15,56,219,192
+ pxor xmm1,xmm1
movups XMMWORD PTR[rcx],xmm0
+ pxor xmm0,xmm0
$L$dec_key_ret::
add rsp,8
DB 0F3h,0C3h ;repret
@@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h
test r8,r8
jz $L$enc_key_ret
+ mov r10d,268437504
movups xmm0,XMMWORD PTR[rcx]
xorps xmm4,xmm4
+ and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
lea rax,QWORD PTR[16+r8]
cmp edx,256
je $L$14rounds
@@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h
$L$10rounds::
mov edx,9
+ cmp r10d,268435456
+ je $L$10rounds_alt
+
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,200,1
call $L$key_expansion_128_cold
@@ -3232,9 +3454,79 @@ DB 102,15,58,223,200,54
jmp $L$enc_key_ret
ALIGN 16
+$L$10rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate]
+ mov r10d,8
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD PTR[r8],xmm0
+ jmp $L$oop_key128
+
+ALIGN 16
+$L$oop_key128::
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ lea rax,QWORD PTR[16+rax]
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[(-16)+rax],xmm0
+ movdqa xmm2,xmm0
+
+ dec r10d
+ jnz $L$oop_key128
+
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1b]
+
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[rax],xmm0
+
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[16+rax],xmm0
+
+ mov DWORD PTR[96+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$12rounds::
movq xmm2,QWORD PTR[16+rcx]
mov edx,11
+ cmp r10d,268435456
+ je $L$12rounds_alt
+
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,202,1
call $L$key_expansion_192a_cold
@@ -3258,10 +3550,54 @@ DB 102,15,58,223,202,128
jmp $L$enc_key_ret
ALIGN 16
+$L$12rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate192]
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ mov r10d,8
+ movdqu XMMWORD PTR[r8],xmm0
+ jmp $L$oop_key192
+
+ALIGN 16
+$L$oop_key192::
+ movq QWORD PTR[rax],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ pslld xmm4,1
+ lea rax,QWORD PTR[24+rax]
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+
+ pshufd xmm3,xmm0,0ffh
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD PTR[(-16)+rax],xmm0
+
+ dec r10d
+ jnz $L$oop_key192
+
+ mov DWORD PTR[32+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$14rounds::
movups xmm2,XMMWORD PTR[16+rcx]
mov edx,13
lea rax,QWORD PTR[16+rax]
+ cmp r10d,268435456
+ je $L$14rounds_alt
+
movups XMMWORD PTR[r8],xmm0
movups XMMWORD PTR[16+r8],xmm2
DB 102,15,58,223,202,1
@@ -3296,9 +3632,69 @@ DB 102,15,58,223,202,64
jmp $L$enc_key_ret
ALIGN 16
+$L$14rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate]
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ mov r10d,7
+ movdqu XMMWORD PTR[r8],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD PTR[16+r8],xmm2
+ jmp $L$oop_key256
+
+ALIGN 16
+$L$oop_key256::
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[rax],xmm0
+
+ dec r10d
+ jz $L$done_key256
+
+ pshufd xmm2,xmm0,0ffh
+ pxor xmm3,xmm3
+DB 102,15,56,221,211
+
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+
+ pxor xmm2,xmm1
+ movdqu XMMWORD PTR[16+rax],xmm2
+ lea rax,QWORD PTR[32+rax]
+ movdqa xmm1,xmm2
+
+ jmp $L$oop_key256
+
+$L$done_key256::
+ mov DWORD PTR[16+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$bad_keybits::
mov rax,-2
$L$enc_key_ret::
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
add rsp,8
DB 0F3h,0C3h ;repret
$L$SEH_end_set_encrypt_key::
@@ -3384,6 +3780,14 @@ $L$xts_magic::
DD 087h,0,1,0
$L$increment1::
DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate::
+ DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
+$L$key_rotate192::
+ DD 004070605h,004070605h,004070605h,004070605h
+$L$key_rcon1::
+ DD 1,1,1,1
+$L$key_rcon1b::
+ DD 01bh,01bh,01bh,01bh
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
@@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE
mov rax,QWORD PTR[152+r8]
mov rbx,QWORD PTR[248+r8]
- lea r10,QWORD PTR[$L$cbc_decrypt]
+ lea r10,QWORD PTR[$L$cbc_decrypt_bulk]
cmp rbx,r10
jb $L$common_seh_tail
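
Note on the key-schedule hunks above: the added $L$10rounds_alt, $L$12rounds_alt and $L$14rounds_alt blocks are an alternate expansion path, taken after the CPU-capability compare against 268435456, that builds each round key with pshufb plus aesenclast against a broadcast rcon (the DB 102,15,56,0,... and DB 102,15,56,221,... byte sequences and the new $L$key_rotate/$L$key_rcon1 constants) instead of aeskeygenassist, followed by the pslldq/pxor prefix-XOR chain. Both paths compute the same AES round-key recurrence. The following is a minimal C sketch of that recurrence for a 128-bit key, written with the aeskeygenassist intrinsic; it illustrates the arithmetic the assembly performs, it is not a drop-in for either path, and the helper name is ours.

    #include <emmintrin.h>
    #include <wmmintrin.h>      /* AES-NI intrinsics; build with -maes */

    /* One AES-128 key-expansion step: given the previous round key and the
     * AESKEYGENASSIST output for it, produce the next round key.  The three
     * slli/xor pairs are the prefix-XOR that the pslldq/pxor chain in the
     * assembly implements; the shuffle broadcasts SubWord(RotWord(w3))^rcon
     * from dword 3, just as the pshufd with 0ffh does above. */
    static __m128i expand_key128(__m128i key, __m128i assist)
    {
        assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3, 3, 3, 3));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        return _mm_xor_si128(key, assist);
    }

    /* The rcon argument of the intrinsic must be a literal, so each round is
     * spelled out, e.g.:
     *   k1 = expand_key128(k0, _mm_aeskeygenassist_si128(k0, 0x01));
     *   k2 = expand_key128(k1, _mm_aeskeygenassist_si128(k1, 0x02));
     *   ...  with rcon running 0x04, 0x08, ..., 0x80, 0x1b, 0x36.          */
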
diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
index 64a1b42cfe..9fdd91d016 100644
--- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
+++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm
@@ -3001,11 +3001,16 @@ PUBLIC bn_get_bits5
ALIGN 16
bn_get_bits5 PROC PUBLIC
- mov r10,rcx
+ lea r10,QWORD PTR[rcx]
+ lea r11,QWORD PTR[1+rcx]
mov ecx,edx
- shr edx,3
- movzx eax,WORD PTR[rdx*1+r10]
- and ecx,7
+ shr edx,4
+ and ecx,15
+ lea eax,DWORD PTR[((-8))+rcx]
+ cmp ecx,11
+ cmova r10,r11
+ cmova ecx,eax
+ movzx eax,WORD PTR[rdx*2+r10]
shr eax,cl
and eax,31
DB 0F3h,0C3h ;repret
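
The bn_get_bits5 change above replaces a 16-bit load addressed at byte granularity with one addressed at 16-bit granularity plus a cmova-based adjustment, which appears intended to keep the unconditional word load inside the number's buffer while still returning the same 5-bit window of the exponent. As a reference point, a plain-C reading of what the routine returns might look like the sketch below; the function name and the byte-wise access pattern are ours, not OpenSSL's.

    /* Return the 5-bit window that starts at bit `bit_off` of the
     * little-endian byte array `p` (what the final `and eax,31` suggests the
     * assembly computes).  p[byte + 1] is touched only when the window
     * actually crosses a byte boundary. */
    static unsigned int get_bits5(const unsigned char *p, size_t bit_off)
    {
        size_t   byte  = bit_off / 8;
        unsigned shift = (unsigned)(bit_off % 8);
        unsigned v     = p[byte];

        if (shift > 3)                      /* bits spill into the next byte */
            v |= (unsigned)p[byte + 1] << 8;
        return (v >> shift) & 31;
    }
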
diff --git a/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s b/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s
index a68f7cdbe9..3bbc4e47d6 100644
--- a/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s
+++ b/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s
@@ -21,7 +21,10 @@ aesni_encrypt:
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_encrypt,.-.L_aesni_encrypt_begin
.globl aesni_decrypt
@@ -45,7 +48,10 @@ aesni_decrypt:
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
.type _aesni_encrypt2,@function
@@ -259,17 +265,15 @@ _aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_encrypt6_enter
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L008enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -283,7 +287,7 @@ _aesni_encrypt6:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L008enc6_loop
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -315,17 +319,15 @@ _aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_decrypt6_enter
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L009dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -339,7 +341,7 @@ _aesni_decrypt6:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L009dec6_loop
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -369,14 +371,14 @@ aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L010ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L011ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L012ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -385,9 +387,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L013ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L014ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -402,12 +404,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L013ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L014ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -416,18 +418,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L012ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L015ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L016ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L017ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L018ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
@@ -436,49 +438,49 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L019enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L019enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_two:
+.L018ecb_enc_two:
call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L017ecb_enc_three:
+.L019ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L018ecb_enc_four:
+.L020ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L011ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L020ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -487,9 +489,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L021ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L022ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -504,12 +506,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L021ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L022ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -518,18 +520,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L020ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L023ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L024ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L025ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L026ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
@@ -538,43 +540,51 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L027dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L027dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_two:
+.L026ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L025ecb_dec_three:
+.L027ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L026ecb_dec_four:
+.L028ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L010ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-.L028ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-.L029ccm64_enc2_loop:
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L029ccm64_enc2_loop
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz .L028ccm64_enc_outer
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L030enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L030enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L031ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L032ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-.L033ccm64_dec2_loop:
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L033ccm64_dec2_loop
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L032ccm64_dec_break:
+.L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L034enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L034enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L035ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -833,7 +859,7 @@ aesni_ctr32_encrypt_blocks:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L036ctr32_tail
+ jb .L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L037ctr32_loop6
+ jmp .L039ctr32_loop6
.align 16
-.L037ctr32_loop6:
+.L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L037ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L038ctr32_ret
+ jz .L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-.L036ctr32_tail:
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L039ctr32_one
+ jb .L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L040ctr32_two
+ je .L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L041ctr32_three
+ jb .L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L042ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
@@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L035ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L039ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L043enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L043enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_two:
+.L042ctr32_two:
call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L041ctr32_three:
+.L043ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L042ctr32_four:
+.L044ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L038ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1020,12 +1057,12 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L044enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L044enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1049,14 +1086,14 @@ aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L045xts_enc_short
+ jc .L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L046xts_enc_loop6
+ jmp .L048xts_enc_loop6
.align 16
-.L046xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1145,23 +1182,23 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L046xts_enc_loop6
+ jnc .L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L045xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L047xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L048xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L049xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1170,7 +1207,7 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L050xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1180,7 +1217,7 @@ aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L051xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1212,9 +1249,9 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1222,20 +1259,20 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L053enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L053enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1249,9 +1286,9 @@ aesni_xts_encrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L050xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1269,9 +1306,9 @@ aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L051xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1293,28 +1330,28 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L055xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L052xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L055xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1322,7 +1359,7 @@ aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L055xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1332,16 +1369,30 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L056enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L056enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L054xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1366,12 +1417,12 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L057enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L057enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1400,14 +1451,14 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L058xts_dec_short
+ jc .L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L059xts_dec_loop6
+ jmp .L061xts_dec_loop6
.align 16
-.L059xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1496,23 +1547,23 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L059xts_dec_loop6
+ jnc .L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L058xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L060xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L061xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L062xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1521,7 +1572,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L063xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1531,7 +1582,7 @@ aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L064xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1563,9 +1614,9 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1573,20 +1624,20 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L066dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L066dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1600,9 +1651,9 @@ aesni_xts_decrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L063xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1620,9 +1671,9 @@ aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L064xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1644,20 +1695,20 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L068xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L065xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1667,7 +1718,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L068xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1681,16 +1732,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L070xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1698,7 +1749,7 @@ aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L070xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1708,16 +1759,30 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L071dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L071dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L067xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1743,7 +1808,7 @@ aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L072cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1751,14 +1816,14 @@ aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L073cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L074cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L075cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1766,24 +1831,25 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L076enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L076enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L075cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L074cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L077cbc_ret
-.L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1794,20 +1860,20 @@ aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L078cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L079cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L080cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L079cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1837,28 +1903,28 @@ aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L080cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L081cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L078cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L082cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L083cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L084cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L085cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1876,55 +1942,62 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L086dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L086dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_two:
+.L085cbc_dec_two:
call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L084cbc_dec_three:
+.L086cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L085cbc_dec_four:
+.L087cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1934,28 +2007,44 @@ aesni_cbc_encrypt:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L081cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L087cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L087cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L077cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L072cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1965,52 +2054,62 @@ aesni_cbc_encrypt:
.type _aesni_set_encrypt_key,@function
.align 16
_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L088bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L088bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal OPENSSL_ia32cap_P,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08914rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L09012rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L091bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09210rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L093key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L094key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L094key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L093key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L09012rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L095key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L096key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L097key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L095key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L098key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L096key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L098key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08914rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L099key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L101key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L101key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L099key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L100key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L088bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L091bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
.globl aesni_set_encrypt_key
@@ -2164,7 +2407,7 @@ aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L102dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2172,7 +2415,7 @@ aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L103dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2182,15 +2425,24 @@ aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L103dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L102dec_key_ret:
+.L114dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
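
A recurring pattern in the hunks above is the block of pxor instructions, plus the movdqa stores of a zeroed %xmm0 where a stack frame held key-derived data, added before every return: SSE registers and scratch stack that carried key, counter or tweak material are wiped before control goes back to the caller. A rough C analogue of that intent is sketched below; the volatile-function-pointer trick is a common way to keep the compiler from discarding a wipe of a dying local (the idea behind OPENSSL_cleanse), and the names here are ours, not part of any OpenSSL API.

    #include <string.h>

    /* Wipe a buffer that held key material.  Calling memset() through a
     * volatile function pointer keeps the store from being removed as a
     * dead write; it is the C-level counterpart of the explicit pxor and
     * movdqa-zero sequences emitted in the assembly. */
    static void *(*const volatile memset_v)(void *, int, size_t) = memset;

    static void wipe(void *buf, size_t len)
    {
        memset_v(buf, 0, len);
    }

    /* e.g. after expanding a key schedule into a local buffer:
     *   unsigned char rk[16 * 15];
     *   ... use rk ...
     *   wipe(rk, sizeof(rk));
     */
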
diff --git a/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s b/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s
index cecd5f83f7..c1f5aec62c 100644
--- a/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s
+++ b/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s
@@ -20,7 +20,10 @@ L000enc1_loop_1:
leal 16(%edx),%edx
jnz L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.align 4
@@ -42,7 +45,10 @@ L001dec1_loop_2:
leal 16(%edx),%edx
jnz L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.align 4
__aesni_encrypt2:
@@ -242,17 +248,15 @@ __aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_encrypt6_enter
+ jmp L008_aesni_encrypt6_inner
.align 4,0x90
-L008enc6_loop:
+L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -266,7 +270,7 @@ L_aesni_encrypt6_enter:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L008enc6_loop
+ jnz L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -296,17 +300,15 @@ __aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_decrypt6_enter
+ jmp L010_aesni_decrypt6_inner
.align 4,0x90
-L009dec6_loop:
+L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -320,7 +322,7 @@ L_aesni_decrypt6_enter:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L009dec6_loop
+ jnz L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L010ecb_ret
+ jz L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L011ecb_decrypt
+ jz L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L012ecb_enc_tail
+ jb L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L013ecb_enc_loop6_enter
+ jmp L015ecb_enc_loop6_enter
.align 4,0x90
-L014ecb_enc_loop6:
+L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -381,12 +383,12 @@ L014ecb_enc_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L013ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L014ecb_enc_loop6
+ jnc L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L012ecb_enc_tail:
+ jz L012ecb_ret
+L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L015ecb_enc_one
+ jb L017ecb_enc_one
movups 16(%esi),%xmm3
- je L016ecb_enc_two
+ je L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L017ecb_enc_three
+ jb L019ecb_enc_three
movups 48(%esi),%xmm5
- je L018ecb_enc_four
+ je L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -415,49 +417,49 @@ L012ecb_enc_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L015ecb_enc_one:
+L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L019enc1_loop_3:
+L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L019enc1_loop_3
+ jnz L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L016ecb_enc_two:
+L018ecb_enc_two:
call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L017ecb_enc_three:
+L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L018ecb_enc_four:
+L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L011ecb_decrypt:
+L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L020ecb_dec_tail
+ jb L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -466,9 +468,9 @@ L011ecb_decrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L021ecb_dec_loop6_enter
+ jmp L023ecb_dec_loop6_enter
.align 4,0x90
-L022ecb_dec_loop6:
+L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -483,12 +485,12 @@ L022ecb_dec_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L021ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L022ecb_dec_loop6
+ jnc L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L020ecb_dec_tail:
+ jz L012ecb_ret
+L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L023ecb_dec_one
+ jb L025ecb_dec_one
movups 16(%esi),%xmm3
- je L024ecb_dec_two
+ je L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L025ecb_dec_three
+ jb L027ecb_dec_three
movups 48(%esi),%xmm5
- je L026ecb_dec_four
+ je L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -517,43 +519,51 @@ L020ecb_dec_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L023ecb_dec_one:
+L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L027dec1_loop_4:
+L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L027dec1_loop_4
+ jnz L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L024ecb_dec_two:
+L026ecb_dec_two:
call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L025ecb_dec_three:
+L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L026ecb_dec_four:
+L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L010ecb_ret:
+L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-L028ccm64_enc_outer:
+L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -607,7 +617,7 @@ L028ccm64_enc_outer:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-L029ccm64_enc2_loop:
+L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -615,7 +625,7 @@ L029ccm64_enc2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L029ccm64_enc2_loop
+ jnz L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -628,10 +638,18 @@ L029ccm64_enc2_loop:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz L028ccm64_enc_outer
+ jnz L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L030enc1_loop_5:
+L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L030enc1_loop_5
+ jnz L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -692,16 +710,16 @@ L030enc1_loop_5:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L031ccm64_dec_outer:
+L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L032ccm64_dec_break
+ jz L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -709,7 +727,7 @@ L031ccm64_dec_outer:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-L033ccm64_dec2_loop:
+L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -717,7 +735,7 @@ L033ccm64_dec2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L033ccm64_dec2_loop
+ jnz L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -725,9 +743,9 @@ L033ccm64_dec2_loop:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L032ccm64_dec_break:
+L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -735,16 +753,24 @@ L032ccm64_dec_break:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L034enc1_loop_6:
+L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L034enc1_loop_6
+ jnz L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L035ctr32_one_shortcut
+ je L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L036ctr32_tail
+ jb L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L037ctr32_loop6
+ jmp L039ctr32_loop6
.align 4,0x90
-L037ctr32_loop6:
+L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -871,27 +897,27 @@ L037ctr32_loop6:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L037ctr32_loop6
+ jnc L039ctr32_loop6
addl $6,%eax
- jz L038ctr32_ret
+ jz L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-L036ctr32_tail:
+L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L039ctr32_one
+ jb L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L040ctr32_two
+ je L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L041ctr32_three
+ jb L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L042ctr32_four
+ je L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -909,29 +935,29 @@ L036ctr32_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L035ctr32_one_shortcut:
+L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L039ctr32_one:
+L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L043enc1_loop_7:
+L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L043enc1_loop_7
+ jnz L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L040ctr32_two:
+L042ctr32_two:
call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -939,9 +965,9 @@ L040ctr32_two:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L041ctr32_three:
+L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -952,9 +978,9 @@ L041ctr32_three:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L042ctr32_four:
+L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -968,7 +994,18 @@ L042ctr32_four:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L038ctr32_ret:
+L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -991,12 +1028,12 @@ L_aesni_xts_encrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L044enc1_loop_8:
+L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L044enc1_loop_8
+ jnz L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1020,14 +1057,14 @@ L044enc1_loop_8:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L045xts_enc_short
+ jc L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L046xts_enc_loop6
+ jmp L048xts_enc_loop6
.align 4,0x90
-L046xts_enc_loop6:
+L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1116,23 +1153,23 @@ L046xts_enc_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L046xts_enc_loop6
+ jnc L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L045xts_enc_short:
+L047xts_enc_short:
addl $96,%eax
- jz L047xts_enc_done6x
+ jz L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L048xts_enc_one
+ jb L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L049xts_enc_two
+ je L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1141,7 +1178,7 @@ L045xts_enc_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L050xts_enc_three
+ jb L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1151,7 +1188,7 @@ L045xts_enc_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L051xts_enc_four
+ je L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1183,9 +1220,9 @@ L045xts_enc_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L048xts_enc_one:
+L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1193,20 +1230,20 @@ L048xts_enc_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L053enc1_loop_9:
+L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L053enc1_loop_9
+ jnz L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L049xts_enc_two:
+L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1220,9 +1257,9 @@ L049xts_enc_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L050xts_enc_three:
+L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1240,9 +1277,9 @@ L050xts_enc_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L051xts_enc_four:
+L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1264,28 +1301,28 @@ L051xts_enc_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L047xts_enc_done6x:
+L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L055xts_enc_steal
+ jmp L057xts_enc_steal
.align 4,0x90
-L052xts_enc_done:
+L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L055xts_enc_steal:
+L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1293,7 +1330,7 @@ L055xts_enc_steal:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L055xts_enc_steal
+ jnz L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1303,16 +1340,30 @@ L055xts_enc_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L056enc1_loop_10:
+L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L056enc1_loop_10
+ jnz L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L054xts_enc_ret:
+L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L057enc1_loop_11:
+L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L057enc1_loop_11
+ jnz L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1369,14 +1420,14 @@ L057enc1_loop_11:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L058xts_dec_short
+ jc L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L059xts_dec_loop6
+ jmp L061xts_dec_loop6
.align 4,0x90
-L059xts_dec_loop6:
+L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1465,23 +1516,23 @@ L059xts_dec_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L059xts_dec_loop6
+ jnc L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L058xts_dec_short:
+L060xts_dec_short:
addl $96,%eax
- jz L060xts_dec_done6x
+ jz L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L061xts_dec_one
+ jb L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L062xts_dec_two
+ je L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1490,7 +1541,7 @@ L058xts_dec_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L063xts_dec_three
+ jb L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1500,7 +1551,7 @@ L058xts_dec_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L064xts_dec_four
+ je L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1532,9 +1583,9 @@ L058xts_dec_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L061xts_dec_one:
+L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1542,20 +1593,20 @@ L061xts_dec_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L066dec1_loop_12:
+L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L066dec1_loop_12
+ jnz L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L062xts_dec_two:
+L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1569,9 +1620,9 @@ L062xts_dec_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L063xts_dec_three:
+L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1589,9 +1640,9 @@ L063xts_dec_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L064xts_dec_four:
+L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1613,20 +1664,20 @@ L064xts_dec_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L060xts_dec_done6x:
+L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
movl %eax,112(%esp)
- jmp L068xts_dec_only_one_more
+ jmp L070xts_dec_only_one_more
.align 4,0x90
-L065xts_dec_done:
+L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1636,7 +1687,7 @@ L065xts_dec_done:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L068xts_dec_only_one_more:
+L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_13:
+L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_13
+ jnz L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L070xts_dec_steal:
+L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1667,7 +1718,7 @@ L070xts_dec_steal:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L070xts_dec_steal
+ jnz L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1677,16 +1728,30 @@ L070xts_dec_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L071dec1_loop_14:
+L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L071dec1_loop_14
+ jnz L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L067xts_dec_ret:
+L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L072cbc_abort
+ jz L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L073cbc_decrypt
+ je L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L074cbc_enc_tail
+ jb L076cbc_enc_tail
subl $16,%eax
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L075cbc_enc_loop:
+L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1733,24 +1798,25 @@ L075cbc_enc_loop:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L076enc1_loop_15:
+L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L076enc1_loop_15
+ jnz L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L075cbc_enc_loop
+ jnc L077cbc_enc_loop
addl $16,%eax
- jnz L074cbc_enc_tail
+ jnz L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L077cbc_ret
-L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp L079cbc_ret
+L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1761,20 +1827,20 @@ L074cbc_enc_tail:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L073cbc_decrypt:
+L075cbc_decrypt:
cmpl $80,%eax
- jbe L078cbc_dec_tail
+ jbe L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L079cbc_dec_loop6_enter
+ jmp L081cbc_dec_loop6_enter
.align 4,0x90
-L080cbc_dec_loop6:
+L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L079cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1804,28 +1870,28 @@ L079cbc_dec_loop6_enter:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L080cbc_dec_loop6
+ ja L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L081cbc_dec_tail_collected
+ jle L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L078cbc_dec_tail:
+L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L082cbc_dec_one
+ jbe L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L083cbc_dec_two
+ jbe L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L084cbc_dec_three
+ jbe L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L085cbc_dec_four
+ jbe L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1843,55 +1909,62 @@ L078cbc_dec_tail:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_one:
+L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L086dec1_loop_16:
+L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L086dec1_loop_16
+ jnz L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_two:
+L085cbc_dec_two:
call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L084cbc_dec_three:
+L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L085cbc_dec_four:
+L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1901,28 +1974,44 @@ L085cbc_dec_four:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-L081cbc_dec_tail_collected:
+ jmp L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L088cbc_dec_tail_collected:
andl $15,%eax
- jnz L087cbc_dec_tail_partial
+ jnz L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp L079cbc_ret
.align 4,0x90
-L087cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L077cbc_ret:
+ movdqa %xmm2,(%esp)
+L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-L072cbc_abort:
+ pxor %xmm7,%xmm7
+L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1930,52 +2019,62 @@ L072cbc_abort:
ret
.align 4
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz L088bad_pointer
+ jz L091bad_pointer
testl %edx,%edx
- jz L088bad_pointer
+ jz L091bad_pointer
+ call L092pic
+L092pic:
+ popl %ebx
+ leal Lkey_const-L092pic(%ebx),%ebx
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je L08914rounds
+ je L09314rounds
cmpl $192,%ecx
- je L09012rounds
+ je L09412rounds
cmpl $128,%ecx
- jne L091bad_keybits
+ jne L095bad_keybits
.align 4,0x90
-L09210rounds:
+L09610rounds:
+ cmpl $268435456,%ebp
+ je L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L093key_128_cold
+ call L098key_128_cold
.byte 102,15,58,223,200,2
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,4
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,8
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,16
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,32
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,64
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,128
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,27
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,54
- call L094key_128
+ call L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L094key_128:
+L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L093key_128_cold:
+L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1984,38 +2083,91 @@ L093key_128_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L09012rounds:
+L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L095key_192a_cold
+ call L103key_192a_cold
.byte 102,15,58,223,202,2
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,4
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,8
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,16
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,32
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,64
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,128
- call L096key_192b
+ call L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L097key_192a:
+L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L095key_192a_cold:
+L103key_192a_cold:
movaps %xmm2,%xmm5
-L098key_192b_warm:
+L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2029,56 +2181,90 @@ L098key_192b_warm:
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L096key_192b:
+L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L098key_192b_warm
+ jmp L106key_192b_warm
.align 4,0x90
-L08914rounds:
+L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L099key_256a_cold
+ call L109key_256a_cold
.byte 102,15,58,223,200,1
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,2
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,2
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,4
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,4
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,8
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,8
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,16
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,16
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,32
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,32
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,64
- call L101key_256a
+ call L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L101key_256a:
+L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L099key_256a_cold:
+L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2087,7 +2273,7 @@ L099key_256a_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L100key_256b:
+L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2097,13 +2283,70 @@ L100key_256b:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 4,0x90
+L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L112loop_key256
+L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 2,0x90
-L088bad_pointer:
+L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 2,0x90
-L091bad_keybits:
+L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.align 4
@@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L102dec_key_ret
+ jnz L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L103dec_key_inverse:
+L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2143,14 +2386,27 @@ L103dec_key_inverse:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L103dec_key_inverse
+ ja L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-L102dec_key_ret:
+L114dec_key_ret:
ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+.comm _OPENSSL_ia32cap_P,16,2
diff --git a/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm b/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm
index 43fdb5a034..6511c21bcf 100644
--- a/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm
+++ b/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm
@@ -17,6 +17,7 @@ IF @Version LT 800
ELSE
.text$ SEGMENT ALIGN(64) 'CODE'
ENDIF
+;EXTERN _OPENSSL_ia32cap_P:NEAR
ALIGN 16
_aesni_encrypt PROC PUBLIC
$L_aesni_encrypt_begin::
@@ -36,7 +37,10 @@ DB 102,15,56,220,209
lea edx,DWORD PTR 16[edx]
jnz $L000enc1_loop_1
DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR [eax],xmm2
+ pxor xmm2,xmm2
ret
_aesni_encrypt ENDP
ALIGN 16
@@ -58,7 +62,10 @@ DB 102,15,56,222,209
lea edx,DWORD PTR 16[edx]
jnz $L001dec1_loop_2
DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR [eax],xmm2
+ pxor xmm2,xmm2
ret
_aesni_decrypt ENDP
ALIGN 16
@@ -265,17 +272,15 @@ DB 102,15,56,220,217
neg ecx
DB 102,15,56,220,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR [ecx*1+edx]
add ecx,16
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
- movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jmp $L_aesni_encrypt6_enter
+ jmp $L008_aesni_encrypt6_inner
ALIGN 16
-$L008enc6_loop:
+$L009enc6_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
+$L008_aesni_encrypt6_inner:
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
@@ -289,7 +294,7 @@ DB 102,15,56,220,232
DB 102,15,56,220,240
DB 102,15,56,220,248
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L008enc6_loop
+ jnz $L009enc6_loop
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
@@ -320,17 +325,15 @@ DB 102,15,56,222,217
neg ecx
DB 102,15,56,222,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR [ecx*1+edx]
add ecx,16
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jmp $L_aesni_decrypt6_enter
+ jmp $L010_aesni_decrypt6_inner
ALIGN 16
-$L009dec6_loop:
+$L011dec6_loop:
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
+$L010_aesni_decrypt6_inner:
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
@@ -344,7 +347,7 @@ DB 102,15,56,222,232
DB 102,15,56,222,240
DB 102,15,56,222,248
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L009dec6_loop
+ jnz $L011dec6_loop
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
@@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin::
mov edx,DWORD PTR 32[esp]
mov ebx,DWORD PTR 36[esp]
and eax,-16
- jz $L010ecb_ret
+ jz $L012ecb_ret
mov ecx,DWORD PTR 240[edx]
test ebx,ebx
- jz $L011ecb_decrypt
+ jz $L013ecb_decrypt
mov ebp,edx
mov ebx,ecx
cmp eax,96
- jb $L012ecb_enc_tail
+ jb $L014ecb_enc_tail
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin::
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
sub eax,96
- jmp $L013ecb_enc_loop6_enter
+ jmp $L015ecb_enc_loop6_enter
ALIGN 16
-$L014ecb_enc_loop6:
+$L016ecb_enc_loop6:
movups XMMWORD PTR [edi],xmm2
movdqu xmm2,XMMWORD PTR [esi]
movups XMMWORD PTR 16[edi],xmm3
@@ -405,12 +408,12 @@ $L014ecb_enc_loop6:
lea edi,DWORD PTR 96[edi]
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
-$L013ecb_enc_loop6_enter:
+$L015ecb_enc_loop6_enter:
call __aesni_encrypt6
mov edx,ebp
mov ecx,ebx
sub eax,96
- jnc $L014ecb_enc_loop6
+ jnc $L016ecb_enc_loop6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
@@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter:
movups XMMWORD PTR 80[edi],xmm7
lea edi,DWORD PTR 96[edi]
add eax,96
- jz $L010ecb_ret
-$L012ecb_enc_tail:
+ jz $L012ecb_ret
+$L014ecb_enc_tail:
movups xmm2,XMMWORD PTR [esi]
cmp eax,32
- jb $L015ecb_enc_one
+ jb $L017ecb_enc_one
movups xmm3,XMMWORD PTR 16[esi]
- je $L016ecb_enc_two
+ je $L018ecb_enc_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,64
- jb $L017ecb_enc_three
+ jb $L019ecb_enc_three
movups xmm5,XMMWORD PTR 48[esi]
- je $L018ecb_enc_four
+ je $L020ecb_enc_four
movups xmm6,XMMWORD PTR 64[esi]
xorps xmm7,xmm7
call __aesni_encrypt6
@@ -439,49 +442,49 @@ $L012ecb_enc_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L015ecb_enc_one:
+$L017ecb_enc_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L019enc1_loop_3:
+$L021enc1_loop_3:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L019enc1_loop_3
+ jnz $L021enc1_loop_3
DB 102,15,56,221,209
movups XMMWORD PTR [edi],xmm2
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L016ecb_enc_two:
+$L018ecb_enc_two:
call __aesni_encrypt2
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L017ecb_enc_three:
+$L019ecb_enc_three:
call __aesni_encrypt3
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L018ecb_enc_four:
+$L020ecb_enc_four:
call __aesni_encrypt4
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L011ecb_decrypt:
+$L013ecb_decrypt:
mov ebp,edx
mov ebx,ecx
cmp eax,96
- jb $L020ecb_dec_tail
+ jb $L022ecb_dec_tail
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -490,9 +493,9 @@ $L011ecb_decrypt:
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
sub eax,96
- jmp $L021ecb_dec_loop6_enter
+ jmp $L023ecb_dec_loop6_enter
ALIGN 16
-$L022ecb_dec_loop6:
+$L024ecb_dec_loop6:
movups XMMWORD PTR [edi],xmm2
movdqu xmm2,XMMWORD PTR [esi]
movups XMMWORD PTR 16[edi],xmm3
@@ -507,12 +510,12 @@ $L022ecb_dec_loop6:
lea edi,DWORD PTR 96[edi]
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
-$L021ecb_dec_loop6_enter:
+$L023ecb_dec_loop6_enter:
call __aesni_decrypt6
mov edx,ebp
mov ecx,ebx
sub eax,96
- jnc $L022ecb_dec_loop6
+ jnc $L024ecb_dec_loop6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
@@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter:
movups XMMWORD PTR 80[edi],xmm7
lea edi,DWORD PTR 96[edi]
add eax,96
- jz $L010ecb_ret
-$L020ecb_dec_tail:
+ jz $L012ecb_ret
+$L022ecb_dec_tail:
movups xmm2,XMMWORD PTR [esi]
cmp eax,32
- jb $L023ecb_dec_one
+ jb $L025ecb_dec_one
movups xmm3,XMMWORD PTR 16[esi]
- je $L024ecb_dec_two
+ je $L026ecb_dec_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,64
- jb $L025ecb_dec_three
+ jb $L027ecb_dec_three
movups xmm5,XMMWORD PTR 48[esi]
- je $L026ecb_dec_four
+ je $L028ecb_dec_four
movups xmm6,XMMWORD PTR 64[esi]
xorps xmm7,xmm7
call __aesni_decrypt6
@@ -541,43 +544,51 @@ $L020ecb_dec_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L023ecb_dec_one:
+$L025ecb_dec_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L027dec1_loop_4:
+$L029dec1_loop_4:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L027dec1_loop_4
+ jnz $L029dec1_loop_4
DB 102,15,56,223,209
movups XMMWORD PTR [edi],xmm2
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L024ecb_dec_two:
+$L026ecb_dec_two:
call __aesni_decrypt2
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L025ecb_dec_three:
+$L027ecb_dec_three:
call __aesni_decrypt3
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L026ecb_dec_four:
+$L028ecb_dec_four:
call __aesni_decrypt4
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
-$L010ecb_ret:
+$L012ecb_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin::
lea edx,DWORD PTR 32[ecx*1+edx]
sub ebx,ecx
DB 102,15,56,0,253
-$L028ccm64_enc_outer:
+$L030ccm64_enc_outer:
movups xmm0,XMMWORD PTR [ebp]
mov ecx,ebx
movups xmm6,XMMWORD PTR [esi]
@@ -631,7 +642,7 @@ $L028ccm64_enc_outer:
xorps xmm0,xmm6
xorps xmm3,xmm0
movups xmm0,XMMWORD PTR 32[ebp]
-$L029ccm64_enc2_loop:
+$L031ccm64_enc2_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
movups xmm1,XMMWORD PTR [ecx*1+edx]
@@ -639,7 +650,7 @@ DB 102,15,56,220,217
DB 102,15,56,220,208
DB 102,15,56,220,216
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L029ccm64_enc2_loop
+ jnz $L031ccm64_enc2_loop
DB 102,15,56,220,209
DB 102,15,56,220,217
paddq xmm7,XMMWORD PTR 16[esp]
@@ -652,10 +663,18 @@ DB 102,15,56,221,216
movups XMMWORD PTR [edi],xmm6
DB 102,15,56,0,213
lea edi,DWORD PTR 16[edi]
- jnz $L028ccm64_enc_outer
+ jnz $L030ccm64_enc_outer
mov esp,DWORD PTR 48[esp]
mov edi,DWORD PTR 40[esp]
movups XMMWORD PTR [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -701,12 +720,12 @@ DB 102,15,56,0,253
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L030enc1_loop_5:
+$L032enc1_loop_5:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L030enc1_loop_5
+ jnz $L032enc1_loop_5
DB 102,15,56,221,209
shl ebx,4
mov ecx,16
@@ -716,16 +735,16 @@ DB 102,15,56,221,209
sub ecx,ebx
lea edx,DWORD PTR 32[ebx*1+ebp]
mov ebx,ecx
- jmp $L031ccm64_dec_outer
+ jmp $L033ccm64_dec_outer
ALIGN 16
-$L031ccm64_dec_outer:
+$L033ccm64_dec_outer:
xorps xmm6,xmm2
movdqa xmm2,xmm7
movups XMMWORD PTR [edi],xmm6
lea edi,DWORD PTR 16[edi]
DB 102,15,56,0,213
sub eax,1
- jz $L032ccm64_dec_break
+ jz $L034ccm64_dec_break
movups xmm0,XMMWORD PTR [ebp]
mov ecx,ebx
movups xmm1,XMMWORD PTR 16[ebp]
@@ -733,7 +752,7 @@ DB 102,15,56,0,213
xorps xmm2,xmm0
xorps xmm3,xmm6
movups xmm0,XMMWORD PTR 32[ebp]
-$L033ccm64_dec2_loop:
+$L035ccm64_dec2_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
movups xmm1,XMMWORD PTR [ecx*1+edx]
@@ -741,7 +760,7 @@ DB 102,15,56,220,217
DB 102,15,56,220,208
DB 102,15,56,220,216
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L033ccm64_dec2_loop
+ jnz $L035ccm64_dec2_loop
movups xmm6,XMMWORD PTR [esi]
paddq xmm7,XMMWORD PTR 16[esp]
DB 102,15,56,220,209
@@ -749,9 +768,9 @@ DB 102,15,56,220,217
DB 102,15,56,221,208
DB 102,15,56,221,216
lea esi,QWORD PTR 16[esi]
- jmp $L031ccm64_dec_outer
+ jmp $L033ccm64_dec_outer
ALIGN 16
-$L032ccm64_dec_break:
+$L034ccm64_dec_break:
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
movups xmm0,XMMWORD PTR [edx]
@@ -759,16 +778,24 @@ $L032ccm64_dec_break:
xorps xmm6,xmm0
lea edx,DWORD PTR 32[edx]
xorps xmm3,xmm6
-$L034enc1_loop_6:
+$L036enc1_loop_6:
DB 102,15,56,220,217
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L034enc1_loop_6
+ jnz $L036enc1_loop_6
DB 102,15,56,221,217
mov esp,DWORD PTR 48[esp]
mov edi,DWORD PTR 40[esp]
movups XMMWORD PTR [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin::
and esp,-16
mov DWORD PTR 80[esp],ebp
cmp eax,1
- je $L035ctr32_one_shortcut
+ je $L037ctr32_one_shortcut
movdqu xmm7,XMMWORD PTR [ebx]
mov DWORD PTR [esp],202182159
mov DWORD PTR 4[esp],134810123
@@ -830,7 +857,7 @@ DB 102,15,56,0,202
pshufd xmm2,xmm0,192
pshufd xmm3,xmm0,128
cmp eax,6
- jb $L036ctr32_tail
+ jb $L038ctr32_tail
pxor xmm7,xmm6
shl ecx,4
mov ebx,16
@@ -839,9 +866,9 @@ DB 102,15,56,0,202
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
sub eax,6
- jmp $L037ctr32_loop6
+ jmp $L039ctr32_loop6
ALIGN 16
-$L037ctr32_loop6:
+$L039ctr32_loop6:
pshufd xmm4,xmm0,64
movdqa xmm0,XMMWORD PTR 32[esp]
pshufd xmm5,xmm1,192
@@ -895,27 +922,27 @@ DB 102,15,56,0,202
lea edi,DWORD PTR 96[edi]
pshufd xmm3,xmm0,128
sub eax,6
- jnc $L037ctr32_loop6
+ jnc $L039ctr32_loop6
add eax,6
- jz $L038ctr32_ret
+ jz $L040ctr32_ret
movdqu xmm7,XMMWORD PTR [ebp]
mov edx,ebp
pxor xmm7,XMMWORD PTR 32[esp]
mov ecx,DWORD PTR 240[ebp]
-$L036ctr32_tail:
+$L038ctr32_tail:
por xmm2,xmm7
cmp eax,2
- jb $L039ctr32_one
+ jb $L041ctr32_one
pshufd xmm4,xmm0,64
por xmm3,xmm7
- je $L040ctr32_two
+ je $L042ctr32_two
pshufd xmm5,xmm1,192
por xmm4,xmm7
cmp eax,4
- jb $L041ctr32_three
+ jb $L043ctr32_three
pshufd xmm6,xmm1,128
por xmm5,xmm7
- je $L042ctr32_four
+ je $L044ctr32_four
por xmm6,xmm7
call __aesni_encrypt6
movups xmm1,XMMWORD PTR [esi]
@@ -933,29 +960,29 @@ $L036ctr32_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L035ctr32_one_shortcut:
+$L037ctr32_one_shortcut:
movups xmm2,XMMWORD PTR [ebx]
mov ecx,DWORD PTR 240[edx]
-$L039ctr32_one:
+$L041ctr32_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L043enc1_loop_7:
+$L045enc1_loop_7:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L043enc1_loop_7
+ jnz $L045enc1_loop_7
DB 102,15,56,221,209
movups xmm6,XMMWORD PTR [esi]
xorps xmm6,xmm2
movups XMMWORD PTR [edi],xmm6
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L040ctr32_two:
+$L042ctr32_two:
call __aesni_encrypt2
movups xmm5,XMMWORD PTR [esi]
movups xmm6,XMMWORD PTR 16[esi]
@@ -963,9 +990,9 @@ $L040ctr32_two:
xorps xmm3,xmm6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L041ctr32_three:
+$L043ctr32_three:
call __aesni_encrypt3
movups xmm5,XMMWORD PTR [esi]
movups xmm6,XMMWORD PTR 16[esi]
@@ -976,9 +1003,9 @@ $L041ctr32_three:
xorps xmm4,xmm7
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L042ctr32_four:
+$L044ctr32_four:
call __aesni_encrypt4
movups xmm6,XMMWORD PTR [esi]
movups xmm7,XMMWORD PTR 16[esi]
@@ -992,7 +1019,18 @@ $L042ctr32_four:
xorps xmm5,xmm0
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
-$L038ctr32_ret:
+$L040ctr32_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
mov esp,DWORD PTR 80[esp]
pop edi
pop esi
@@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin::
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L044enc1_loop_8:
+$L046enc1_loop_8:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L044enc1_loop_8
+ jnz $L046enc1_loop_8
DB 102,15,56,221,209
mov esi,DWORD PTR 20[esp]
mov edi,DWORD PTR 24[esp]
@@ -1044,14 +1082,14 @@ DB 102,15,56,221,209
mov ebp,edx
mov ebx,ecx
sub eax,96
- jc $L045xts_enc_short
+ jc $L047xts_enc_short
shl ecx,4
mov ebx,16
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
- jmp $L046xts_enc_loop6
+ jmp $L048xts_enc_loop6
ALIGN 16
-$L046xts_enc_loop6:
+$L048xts_enc_loop6:
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa XMMWORD PTR [esp],xmm1
@@ -1140,23 +1178,23 @@ DB 102,15,56,220,249
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
sub eax,96
- jnc $L046xts_enc_loop6
+ jnc $L048xts_enc_loop6
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
mov ebx,ecx
-$L045xts_enc_short:
+$L047xts_enc_short:
add eax,96
- jz $L047xts_enc_done6x
+ jz $L049xts_enc_done6x
movdqa xmm5,xmm1
cmp eax,32
- jb $L048xts_enc_one
+ jb $L050xts_enc_one
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
paddq xmm1,xmm1
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
- je $L049xts_enc_two
+ je $L051xts_enc_two
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm6,xmm1
@@ -1165,7 +1203,7 @@ $L045xts_enc_short:
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
cmp eax,64
- jb $L050xts_enc_three
+ jb $L052xts_enc_three
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm7,xmm1
@@ -1175,7 +1213,7 @@ $L045xts_enc_short:
pxor xmm1,xmm2
movdqa XMMWORD PTR [esp],xmm5
movdqa XMMWORD PTR 16[esp],xmm6
- je $L051xts_enc_four
+ je $L053xts_enc_four
movdqa XMMWORD PTR 32[esp],xmm7
pshufd xmm7,xmm0,19
movdqa XMMWORD PTR 48[esp],xmm1
@@ -1207,9 +1245,9 @@ $L045xts_enc_short:
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L048xts_enc_one:
+$L050xts_enc_one:
movups xmm2,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
xorps xmm2,xmm5
@@ -1217,20 +1255,20 @@ $L048xts_enc_one:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L053enc1_loop_9:
+$L055enc1_loop_9:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L053enc1_loop_9
+ jnz $L055enc1_loop_9
DB 102,15,56,221,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
movdqa xmm1,xmm5
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L049xts_enc_two:
+$L051xts_enc_two:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1244,9 +1282,9 @@ $L049xts_enc_two:
movups XMMWORD PTR 16[edi],xmm3
lea edi,DWORD PTR 32[edi]
movdqa xmm1,xmm6
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L050xts_enc_three:
+$L052xts_enc_three:
movaps xmm7,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1264,9 +1302,9 @@ $L050xts_enc_three:
movups XMMWORD PTR 32[edi],xmm4
lea edi,DWORD PTR 48[edi]
movdqa xmm1,xmm7
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L051xts_enc_four:
+$L053xts_enc_four:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1288,28 +1326,28 @@ $L051xts_enc_four:
movups XMMWORD PTR 48[edi],xmm5
lea edi,DWORD PTR 64[edi]
movdqa xmm1,xmm6
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L047xts_enc_done6x:
+$L049xts_enc_done6x:
mov eax,DWORD PTR 112[esp]
and eax,15
- jz $L054xts_enc_ret
+ jz $L056xts_enc_ret
movdqa xmm5,xmm1
mov DWORD PTR 112[esp],eax
- jmp $L055xts_enc_steal
+ jmp $L057xts_enc_steal
ALIGN 16
-$L052xts_enc_done:
+$L054xts_enc_done:
mov eax,DWORD PTR 112[esp]
pxor xmm0,xmm0
and eax,15
- jz $L054xts_enc_ret
+ jz $L056xts_enc_ret
pcmpgtd xmm0,xmm1
mov DWORD PTR 112[esp],eax
pshufd xmm5,xmm0,19
paddq xmm1,xmm1
pand xmm5,XMMWORD PTR 96[esp]
pxor xmm5,xmm1
-$L055xts_enc_steal:
+$L057xts_enc_steal:
movzx ecx,BYTE PTR [esi]
movzx edx,BYTE PTR [edi-16]
lea esi,DWORD PTR 1[esi]
@@ -1317,7 +1355,7 @@ $L055xts_enc_steal:
mov BYTE PTR [edi],dl
lea edi,DWORD PTR 1[edi]
sub eax,1
- jnz $L055xts_enc_steal
+ jnz $L057xts_enc_steal
sub edi,DWORD PTR 112[esp]
mov edx,ebp
mov ecx,ebx
@@ -1327,16 +1365,30 @@ $L055xts_enc_steal:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L056enc1_loop_10:
+$L058enc1_loop_10:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L056enc1_loop_10
+ jnz $L058enc1_loop_10
DB 102,15,56,221,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi-16],xmm2
-$L054xts_enc_ret:
+$L056xts_enc_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa XMMWORD PTR [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa XMMWORD PTR 16[esp],xmm0
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
+ movdqa XMMWORD PTR 80[esp],xmm0
mov esp,DWORD PTR 116[esp]
pop edi
pop esi
@@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin::
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L057enc1_loop_11:
+$L059enc1_loop_11:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L057enc1_loop_11
+ jnz $L059enc1_loop_11
DB 102,15,56,221,209
mov esi,DWORD PTR 20[esp]
mov edi,DWORD PTR 24[esp]
@@ -1393,14 +1445,14 @@ DB 102,15,56,221,209
pcmpgtd xmm0,xmm1
and eax,-16
sub eax,96
- jc $L058xts_dec_short
+ jc $L060xts_dec_short
shl ecx,4
mov ebx,16
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
- jmp $L059xts_dec_loop6
+ jmp $L061xts_dec_loop6
ALIGN 16
-$L059xts_dec_loop6:
+$L061xts_dec_loop6:
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa XMMWORD PTR [esp],xmm1
@@ -1489,23 +1541,23 @@ DB 102,15,56,222,249
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
sub eax,96
- jnc $L059xts_dec_loop6
+ jnc $L061xts_dec_loop6
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
mov ebx,ecx
-$L058xts_dec_short:
+$L060xts_dec_short:
add eax,96
- jz $L060xts_dec_done6x
+ jz $L062xts_dec_done6x
movdqa xmm5,xmm1
cmp eax,32
- jb $L061xts_dec_one
+ jb $L063xts_dec_one
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
paddq xmm1,xmm1
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
- je $L062xts_dec_two
+ je $L064xts_dec_two
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm6,xmm1
@@ -1514,7 +1566,7 @@ $L058xts_dec_short:
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
cmp eax,64
- jb $L063xts_dec_three
+ jb $L065xts_dec_three
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm7,xmm1
@@ -1524,7 +1576,7 @@ $L058xts_dec_short:
pxor xmm1,xmm2
movdqa XMMWORD PTR [esp],xmm5
movdqa XMMWORD PTR 16[esp],xmm6
- je $L064xts_dec_four
+ je $L066xts_dec_four
movdqa XMMWORD PTR 32[esp],xmm7
pshufd xmm7,xmm0,19
movdqa XMMWORD PTR 48[esp],xmm1
@@ -1556,9 +1608,9 @@ $L058xts_dec_short:
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L061xts_dec_one:
+$L063xts_dec_one:
movups xmm2,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
xorps xmm2,xmm5
@@ -1566,20 +1618,20 @@ $L061xts_dec_one:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L066dec1_loop_12:
+$L068dec1_loop_12:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L066dec1_loop_12
+ jnz $L068dec1_loop_12
DB 102,15,56,223,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
movdqa xmm1,xmm5
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L062xts_dec_two:
+$L064xts_dec_two:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1593,9 +1645,9 @@ $L062xts_dec_two:
movups XMMWORD PTR 16[edi],xmm3
lea edi,DWORD PTR 32[edi]
movdqa xmm1,xmm6
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L063xts_dec_three:
+$L065xts_dec_three:
movaps xmm7,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1613,9 +1665,9 @@ $L063xts_dec_three:
movups XMMWORD PTR 32[edi],xmm4
lea edi,DWORD PTR 48[edi]
movdqa xmm1,xmm7
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L064xts_dec_four:
+$L066xts_dec_four:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1637,20 +1689,20 @@ $L064xts_dec_four:
movups XMMWORD PTR 48[edi],xmm5
lea edi,DWORD PTR 64[edi]
movdqa xmm1,xmm6
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L060xts_dec_done6x:
+$L062xts_dec_done6x:
mov eax,DWORD PTR 112[esp]
and eax,15
- jz $L067xts_dec_ret
+ jz $L069xts_dec_ret
mov DWORD PTR 112[esp],eax
- jmp $L068xts_dec_only_one_more
+ jmp $L070xts_dec_only_one_more
ALIGN 16
-$L065xts_dec_done:
+$L067xts_dec_done:
mov eax,DWORD PTR 112[esp]
pxor xmm0,xmm0
and eax,15
- jz $L067xts_dec_ret
+ jz $L069xts_dec_ret
pcmpgtd xmm0,xmm1
mov DWORD PTR 112[esp],eax
pshufd xmm2,xmm0,19
@@ -1660,7 +1712,7 @@ $L065xts_dec_done:
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
-$L068xts_dec_only_one_more:
+$L070xts_dec_only_one_more:
pshufd xmm5,xmm0,19
movdqa xmm6,xmm1
paddq xmm1,xmm1
@@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L069dec1_loop_13:
+$L071dec1_loop_13:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L069dec1_loop_13
+ jnz $L071dec1_loop_13
DB 102,15,56,223,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
-$L070xts_dec_steal:
+$L072xts_dec_steal:
movzx ecx,BYTE PTR 16[esi]
movzx edx,BYTE PTR [edi]
lea esi,DWORD PTR 1[esi]
@@ -1691,7 +1743,7 @@ $L070xts_dec_steal:
mov BYTE PTR 16[edi],dl
lea edi,DWORD PTR 1[edi]
sub eax,1
- jnz $L070xts_dec_steal
+ jnz $L072xts_dec_steal
sub edi,DWORD PTR 112[esp]
mov edx,ebp
mov ecx,ebx
@@ -1701,16 +1753,30 @@ $L070xts_dec_steal:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L071dec1_loop_14:
+$L073dec1_loop_14:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L071dec1_loop_14
+ jnz $L073dec1_loop_14
DB 102,15,56,223,209
xorps xmm2,xmm6
movups XMMWORD PTR [edi],xmm2
-$L067xts_dec_ret:
+$L069xts_dec_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa XMMWORD PTR [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa XMMWORD PTR 16[esp],xmm0
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
+ movdqa XMMWORD PTR 80[esp],xmm0
mov esp,DWORD PTR 116[esp]
pop edi
pop esi
@@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin::
mov edx,DWORD PTR 32[esp]
mov ebp,DWORD PTR 36[esp]
test eax,eax
- jz $L072cbc_abort
+ jz $L074cbc_abort
cmp DWORD PTR 40[esp],0
xchg ebx,esp
movups xmm7,XMMWORD PTR [ebp]
@@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin::
mov ebp,edx
mov DWORD PTR 16[esp],ebx
mov ebx,ecx
- je $L073cbc_decrypt
+ je $L075cbc_decrypt
movaps xmm2,xmm7
cmp eax,16
- jb $L074cbc_enc_tail
+ jb $L076cbc_enc_tail
sub eax,16
- jmp $L075cbc_enc_loop
+ jmp $L077cbc_enc_loop
ALIGN 16
-$L075cbc_enc_loop:
+$L077cbc_enc_loop:
movups xmm7,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
movups xmm0,XMMWORD PTR [edx]
@@ -1757,24 +1823,25 @@ $L075cbc_enc_loop:
xorps xmm7,xmm0
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm7
-$L076enc1_loop_15:
+$L078enc1_loop_15:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L076enc1_loop_15
+ jnz $L078enc1_loop_15
DB 102,15,56,221,209
mov ecx,ebx
mov edx,ebp
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
sub eax,16
- jnc $L075cbc_enc_loop
+ jnc $L077cbc_enc_loop
add eax,16
- jnz $L074cbc_enc_tail
+ jnz $L076cbc_enc_tail
movaps xmm7,xmm2
- jmp $L077cbc_ret
-$L074cbc_enc_tail:
+ pxor xmm2,xmm2
+ jmp $L079cbc_ret
+$L076cbc_enc_tail:
mov ecx,eax
DD 2767451785
mov ecx,16
@@ -1785,20 +1852,20 @@ DD 2868115081
mov ecx,ebx
mov esi,edi
mov edx,ebp
- jmp $L075cbc_enc_loop
+ jmp $L077cbc_enc_loop
ALIGN 16
-$L073cbc_decrypt:
+$L075cbc_decrypt:
cmp eax,80
- jbe $L078cbc_dec_tail
+ jbe $L080cbc_dec_tail
movaps XMMWORD PTR [esp],xmm7
sub eax,80
- jmp $L079cbc_dec_loop6_enter
+ jmp $L081cbc_dec_loop6_enter
ALIGN 16
-$L080cbc_dec_loop6:
+$L082cbc_dec_loop6:
movaps XMMWORD PTR [esp],xmm0
movups XMMWORD PTR [edi],xmm7
lea edi,DWORD PTR 16[edi]
-$L079cbc_dec_loop6_enter:
+$L081cbc_dec_loop6_enter:
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter:
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
sub eax,96
- ja $L080cbc_dec_loop6
+ ja $L082cbc_dec_loop6
movaps xmm2,xmm7
movaps xmm7,xmm0
add eax,80
- jle $L081cbc_dec_tail_collected
+ jle $L083cbc_dec_clear_tail_collected
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
-$L078cbc_dec_tail:
+$L080cbc_dec_tail:
movups xmm2,XMMWORD PTR [esi]
movaps xmm6,xmm2
cmp eax,16
- jbe $L082cbc_dec_one
+ jbe $L084cbc_dec_one
movups xmm3,XMMWORD PTR 16[esi]
movaps xmm5,xmm3
cmp eax,32
- jbe $L083cbc_dec_two
+ jbe $L085cbc_dec_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,48
- jbe $L084cbc_dec_three
+ jbe $L086cbc_dec_three
movups xmm5,XMMWORD PTR 48[esi]
cmp eax,64
- jbe $L085cbc_dec_four
+ jbe $L087cbc_dec_four
movups xmm6,XMMWORD PTR 64[esi]
movaps XMMWORD PTR [esp],xmm7
movups xmm2,XMMWORD PTR [esi]
@@ -1867,55 +1934,62 @@ $L078cbc_dec_tail:
xorps xmm6,xmm0
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR 32[edi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR 48[edi],xmm5
+ pxor xmm5,xmm5
lea edi,DWORD PTR 64[edi]
movaps xmm2,xmm6
+ pxor xmm6,xmm6
sub eax,80
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L082cbc_dec_one:
+$L084cbc_dec_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L086dec1_loop_16:
+$L089dec1_loop_16:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L086dec1_loop_16
+ jnz $L089dec1_loop_16
DB 102,15,56,223,209
xorps xmm2,xmm7
movaps xmm7,xmm6
sub eax,16
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L083cbc_dec_two:
+$L085cbc_dec_two:
call __aesni_decrypt2
xorps xmm2,xmm7
xorps xmm3,xmm6
movups XMMWORD PTR [edi],xmm2
movaps xmm2,xmm3
+ pxor xmm3,xmm3
lea edi,DWORD PTR 16[edi]
movaps xmm7,xmm5
sub eax,32
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L084cbc_dec_three:
+$L086cbc_dec_three:
call __aesni_decrypt3
xorps xmm2,xmm7
xorps xmm3,xmm6
xorps xmm4,xmm5
movups XMMWORD PTR [edi],xmm2
movaps xmm2,xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
lea edi,DWORD PTR 32[edi]
movups xmm7,XMMWORD PTR 32[esi]
sub eax,48
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L085cbc_dec_four:
+$L087cbc_dec_four:
call __aesni_decrypt4
movups xmm1,XMMWORD PTR 16[esi]
movups xmm0,XMMWORD PTR 32[esi]
@@ -1925,28 +1999,44 @@ $L085cbc_dec_four:
movups XMMWORD PTR [edi],xmm2
xorps xmm4,xmm1
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
xorps xmm5,xmm0
movups XMMWORD PTR 32[edi],xmm4
+ pxor xmm4,xmm4
lea edi,DWORD PTR 48[edi]
movaps xmm2,xmm5
+ pxor xmm5,xmm5
sub eax,64
-$L081cbc_dec_tail_collected:
+ jmp $L088cbc_dec_tail_collected
+ALIGN 16
+$L083cbc_dec_clear_tail_collected:
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+$L088cbc_dec_tail_collected:
and eax,15
- jnz $L087cbc_dec_tail_partial
+ jnz $L090cbc_dec_tail_partial
movups XMMWORD PTR [edi],xmm2
- jmp $L077cbc_ret
+ pxor xmm0,xmm0
+ jmp $L079cbc_ret
ALIGN 16
-$L087cbc_dec_tail_partial:
+$L090cbc_dec_tail_partial:
movaps XMMWORD PTR [esp],xmm2
+ pxor xmm0,xmm0
mov ecx,16
mov esi,esp
sub ecx,eax
DD 2767451785
-$L077cbc_ret:
+ movdqa XMMWORD PTR [esp],xmm2
+$L079cbc_ret:
mov esp,DWORD PTR 16[esp]
mov ebp,DWORD PTR 36[esp]
+ pxor xmm2,xmm2
+ pxor xmm1,xmm1
movups XMMWORD PTR [ebp],xmm7
-$L072cbc_abort:
+ pxor xmm7,xmm7
+$L074cbc_abort:
pop edi
pop esi
pop ebx
@@ -1955,52 +2045,62 @@ $L072cbc_abort:
_aesni_cbc_encrypt ENDP
ALIGN 16
__aesni_set_encrypt_key PROC PRIVATE
+ push ebp
+ push ebx
test eax,eax
- jz $L088bad_pointer
+ jz $L091bad_pointer
test edx,edx
- jz $L088bad_pointer
+ jz $L091bad_pointer
+ call $L092pic
+$L092pic:
+ pop ebx
+ lea ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx]
+ lea ebp,DWORD PTR _OPENSSL_ia32cap_P
movups xmm0,XMMWORD PTR [eax]
xorps xmm4,xmm4
+ mov ebp,DWORD PTR 4[ebp]
lea edx,DWORD PTR 16[edx]
+ and ebp,268437504
cmp ecx,256
- je $L08914rounds
+ je $L09314rounds
cmp ecx,192
- je $L09012rounds
+ je $L09412rounds
cmp ecx,128
- jne $L091bad_keybits
+ jne $L095bad_keybits
ALIGN 16
-$L09210rounds:
+$L09610rounds:
+ cmp ebp,268435456
+ je $L09710rounds_alt
mov ecx,9
movups XMMWORD PTR [edx-16],xmm0
DB 102,15,58,223,200,1
- call $L093key_128_cold
+ call $L098key_128_cold
DB 102,15,58,223,200,2
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,4
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,8
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,16
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,32
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,64
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,128
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,27
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,54
- call $L094key_128
+ call $L099key_128
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 80[edx],ecx
- xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L094key_128:
+$L099key_128:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
-$L093key_128_cold:
+$L098key_128_cold:
shufps xmm4,xmm0,16
xorps xmm0,xmm4
shufps xmm4,xmm0,140
@@ -2009,38 +2109,91 @@ $L093key_128_cold:
xorps xmm0,xmm1
ret
ALIGN 16
-$L09012rounds:
+$L09710rounds_alt:
+ movdqa xmm5,XMMWORD PTR [ebx]
+ mov ecx,8
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD PTR [edx-16],xmm0
+$L101loop_key128:
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ lea edx,DWORD PTR 16[edx]
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx-16],xmm0
+ movdqa xmm2,xmm0
+ dec ecx
+ jnz $L101loop_key128
+ movdqa xmm4,XMMWORD PTR 48[ebx]
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx],xmm0
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR 16[edx],xmm0
+ mov ecx,9
+ mov DWORD PTR 96[edx],ecx
+ jmp $L100good_key
+ALIGN 16
+$L09412rounds:
movq xmm2,QWORD PTR 16[eax]
+ cmp ebp,268435456
+ je $L10212rounds_alt
mov ecx,11
movups XMMWORD PTR [edx-16],xmm0
DB 102,15,58,223,202,1
- call $L095key_192a_cold
+ call $L103key_192a_cold
DB 102,15,58,223,202,2
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,4
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,8
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,16
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,32
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,64
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,128
- call $L096key_192b
+ call $L104key_192b
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 48[edx],ecx
- xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L097key_192a:
+$L105key_192a:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
ALIGN 16
-$L095key_192a_cold:
+$L103key_192a_cold:
movaps xmm5,xmm2
-$L098key_192b_warm:
+$L106key_192b_warm:
shufps xmm4,xmm0,16
movdqa xmm3,xmm2
xorps xmm0,xmm4
@@ -2054,56 +2207,90 @@ $L098key_192b_warm:
pxor xmm2,xmm3
ret
ALIGN 16
-$L096key_192b:
+$L104key_192b:
movaps xmm3,xmm0
shufps xmm5,xmm0,68
movups XMMWORD PTR [edx],xmm5
shufps xmm3,xmm2,78
movups XMMWORD PTR 16[edx],xmm3
lea edx,DWORD PTR 32[edx]
- jmp $L098key_192b_warm
+ jmp $L106key_192b_warm
+ALIGN 16
+$L10212rounds_alt:
+ movdqa xmm5,XMMWORD PTR 16[ebx]
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ mov ecx,8
+ movdqu XMMWORD PTR [edx-16],xmm0
+$L107loop_key192:
+ movq QWORD PTR [edx],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ pslld xmm4,1
+ lea edx,DWORD PTR 24[edx]
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pshufd xmm3,xmm0,255
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD PTR [edx-16],xmm0
+ dec ecx
+ jnz $L107loop_key192
+ mov ecx,11
+ mov DWORD PTR 32[edx],ecx
+ jmp $L100good_key
ALIGN 16
-$L08914rounds:
+$L09314rounds:
movups xmm2,XMMWORD PTR 16[eax]
- mov ecx,13
lea edx,DWORD PTR 16[edx]
+ cmp ebp,268435456
+ je $L10814rounds_alt
+ mov ecx,13
movups XMMWORD PTR [edx-32],xmm0
movups XMMWORD PTR [edx-16],xmm2
DB 102,15,58,223,202,1
- call $L099key_256a_cold
+ call $L109key_256a_cold
DB 102,15,58,223,200,1
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,2
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,2
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,4
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,4
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,8
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,8
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,16
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,16
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,32
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,32
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,64
- call $L101key_256a
+ call $L111key_256a
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 16[edx],ecx
xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L101key_256a:
+$L111key_256a:
movups XMMWORD PTR [edx],xmm2
lea edx,DWORD PTR 16[edx]
-$L099key_256a_cold:
+$L109key_256a_cold:
shufps xmm4,xmm0,16
xorps xmm0,xmm4
shufps xmm4,xmm0,140
@@ -2112,7 +2299,7 @@ $L099key_256a_cold:
xorps xmm0,xmm1
ret
ALIGN 16
-$L100key_256b:
+$L110key_256b:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
shufps xmm4,xmm2,16
@@ -2122,13 +2309,70 @@ $L100key_256b:
shufps xmm1,xmm1,170
xorps xmm2,xmm1
ret
+ALIGN 16
+$L10814rounds_alt:
+ movdqa xmm5,XMMWORD PTR [ebx]
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ mov ecx,7
+ movdqu XMMWORD PTR [edx-32],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD PTR [edx-16],xmm2
+$L112loop_key256:
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx],xmm0
+ dec ecx
+ jz $L113done_key256
+ pshufd xmm2,xmm0,255
+ pxor xmm3,xmm3
+DB 102,15,56,221,211
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+ pxor xmm2,xmm1
+ movdqu XMMWORD PTR 16[edx],xmm2
+ lea edx,DWORD PTR 32[edx]
+ movdqa xmm1,xmm2
+ jmp $L112loop_key256
+$L113done_key256:
+ mov ecx,13
+ mov DWORD PTR 16[edx],ecx
+$L100good_key:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ xor eax,eax
+ pop ebx
+ pop ebp
+ ret
ALIGN 4
-$L088bad_pointer:
+$L091bad_pointer:
mov eax,-1
+ pop ebx
+ pop ebp
ret
ALIGN 4
-$L091bad_keybits:
+$L095bad_keybits:
+ pxor xmm0,xmm0
mov eax,-2
+ pop ebx
+ pop ebp
ret
__aesni_set_encrypt_key ENDP
ALIGN 16
@@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin::
mov edx,DWORD PTR 12[esp]
shl ecx,4
test eax,eax
- jnz $L102dec_key_ret
+ jnz $L114dec_key_ret
lea eax,DWORD PTR 16[ecx*1+edx]
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR [eax]
@@ -2158,7 +2402,7 @@ $L_aesni_set_decrypt_key_begin::
movups XMMWORD PTR [edx],xmm1
lea edx,DWORD PTR 16[edx]
lea eax,DWORD PTR [eax-16]
-$L103dec_key_inverse:
+$L115dec_key_inverse:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR [eax]
DB 102,15,56,219,192
@@ -2168,17 +2412,28 @@ DB 102,15,56,219,201
movups XMMWORD PTR 16[eax],xmm0
movups XMMWORD PTR [edx-16],xmm1
cmp eax,edx
- ja $L103dec_key_inverse
+ ja $L115dec_key_inverse
movups xmm0,XMMWORD PTR [edx]
DB 102,15,56,219,192
movups XMMWORD PTR [edx],xmm0
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
xor eax,eax
-$L102dec_key_ret:
+$L114dec_key_ret:
ret
_aesni_set_decrypt_key ENDP
+ALIGN 64
+$Lkey_const::
+DD 202313229,202313229,202313229,202313229
+DD 67569157,67569157,67569157,67569157
+DD 1,1,1,1
+DD 27,27,27,27
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
DB 115,108,46,111,114,103,62,0
.text$ ENDS
+.bss SEGMENT 'BSS'
+COMM _OPENSSL_ia32cap_P:DWORD:4
+.bss ENDS
END
diff --git a/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S b/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S
index 732ba3d9c8..fd979d078f 100644
--- a/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S
+++ b/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S
@@ -230,17 +230,17 @@ aes_v8_encrypt:
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
- vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q1},[r2]!
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
- vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
@@ -259,17 +259,17 @@ aes_v8_decrypt:
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
- vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q1},[r2]!
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
- vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
@@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
veor q5,q8,q7
beq .Lcbc_enc128
+ vld1.32 {q2-q3},[r7]
+ add r7,r3,#16
+ add r6,r3,#16*4
+ add r12,r3,#16*5
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r14,r3,#16*6
+ add r3,r3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r6,r6,#2
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- bgt .Loop_cbc_enc
+ .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r6]
+ cmp r5,#4
+ .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r12]
+ beq .Lcbc_enc192
+
+ .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r14]
+ .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+ .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r3]
+ nop
+.Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
@@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
@@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
-
- mov r6,r5
veor q6,q0,q7
- vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
+ vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
@@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
- .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- veor q4,q6,q7
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q4,q6,q7
+ subs r2,r2,#0x30
veor q5,q2,q7
+ movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
- .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- veor q9,q3,q7
- subs r2,r2,#0x30
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vorr q6,q11,q11
- movlo r6,r2 @ r6, r6, is zero at this point
- .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
- .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+ veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
+ vorr q6,q11,q11
+ mov r7,r3
+ .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- mov r7,r3
- .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
- .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
vld1.8 {q2},[r0]!
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
- .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
- vld1.8 {q11},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
-
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vorr q0,q2,q2
vst1.8 {q4},[r1]!
- vorr q1,q3,q3
+ vorr q0,q2,q2
vst1.8 {q5},[r1]!
+ vorr q1,q3,q3
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs .Loop3x_cbc_dec
@@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
- vld1.32 {q8},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
- vld1.32 {q9},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
- .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
- .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
- .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
- .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
- .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
@@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q9},[r7]!
bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
- mov r7,r3
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
- vld1.8 {q2},[r0]!
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
- .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.8 {q2},[r0]!
vorr q0,q6,q6
- .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+ .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+ .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
- .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
- .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vorr q1,q6,q6
+ .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vld1.8 {q11},[r0]!
+ .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.8 {q11},[r0]!
+ mov r7,r3
+ .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
- .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7
add r10,r8,#2
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
- .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7
rev r9,r9
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+ .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
- .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
+ .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
- .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
@@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
- mov r6,r5
veor q2,q2,q4
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vst1.8 {q2},[r1]!
veor q3,q3,q5
+ mov r6,r5
+ vst1.8 {q3},[r1]!
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vst1.8 {q2},[r1]!
- vst1.8 {q3},[r1]!
vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32
@@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
- vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
- vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q9},[r7]!
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
- .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
- .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
- .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
- vld1.8 {q3},[r0]
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
- .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
- .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
veor q2,q2,q7
+ .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
diff --git a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S
index d321235f79..c54f514997 100644
--- a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S
+++ b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S
@@ -495,7 +495,7 @@ gcm_ghash_neon:
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
- veor d1,d1,d20 @
+ veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase
diff --git a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S
index 570d9175c4..269574945f 100644
--- a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S
+++ b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S
@@ -7,109 +7,223 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
- vld1.64 {q9},[r1] @ load H
- vmov.i8 q8,#0xe1
+ vld1.64 {q9},[r1] @ load input H
+ vmov.i8 q11,#0xe1
+ vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
- vshl.i64 q8,q8,#57
- vshr.u64 q10,q8,#63
- vext.8 q8,q10,q8,#8 @ t0=0xc2....01
+ vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
- vshr.u64 q11,q3,#63
+ vext.8 q8,q10,q11,#8 @ t0=0xc2....01
+ vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
- vand q11,q11,q8
+ vand q10,q10,q8
vshl.i64 q3,q3,#1
- vext.8 q11,q11,q11,#8
+ vext.8 q10,q10,q10,#8
vand q8,q8,q9
- vorr q3,q3,q11 @ H<<<=1
- veor q3,q3,q8 @ twisted H
- vst1.64 {q3},[r0]
+ vorr q3,q3,q10 @ H<<<=1
+ veor q12,q3,q8 @ twisted H
+ vst1.64 {q12},[r0]! @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
+ .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+ veor q8,q8,q12
+ .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+ .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q14,q0,q10
+
+ vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ veor q9,q9,q14
+ vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
+ vst1.64 {q13-q14},[r0] @ store Htable[1..2]
bx lr
.size gcm_init_v8,.-gcm_init_v8
-
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
- vld1.64 {q12},[r1] @ load twisted H
+ vld1.64 {q12-q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
- vext.8 q13,q12,q12,#8
- mov r3,#0
vext.8 q3,q9,q9,#8
- mov r12,#0
- veor q13,q13,q12 @ Karatsuba pre-processing
- mov r2,r0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
+ .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+ .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+ .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ bx lr
+.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
+ vstmdb sp!,{d8-d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
- subs r3,r3,#16
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+				@ alorithm specification @ [sic in upstream; read: algorithm]
+ subs r3,r3,#32 @ see if r3 is 32 or larger
+ mov r12,#16 @ r12 is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ r12 is zeroed just in time
+				@ to preclude overstepping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
- mov r12,#16
- vld1.64 {q12},[r1] @ load twisted H
- moveq r12,#0
- vext.8 q0,q0,q0,#8
- vshl.u64 q11,q11,#57
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
- vext.8 q13,q12,q12,#8
+ vld1.64 {q14},[r1]
+ moveq r12,#0 @ is it time to zero r12?
+ vext.8 q0,q0,q0,#8 @ rotate Xi
+ vld1.64 {q8},[r2]! @ load [rotated] I[0]
+ vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
+ vrev64.8 q8,q8
vrev64.8 q0,q0
+#endif
+ vext.8 q3,q8,q8,#8 @ rotate I[0]
+ blo .Lodd_tail_v8 @ r3 was less than 32
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
+#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
- veor q13,q13,q12 @ Karatsuba pre-processing
- vext.8 q3,q9,q9,#8
- b .Loop_v8
+ vext.8 q7,q9,q9,#8
+ veor q3,q3,q0 @ I[i]^=Xi
+ .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ b .Loop_mod2x_v8
.align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+ vext.8 q10,q3,q3,#8
+ subs r3,r3,#32 @ is there more data?
+ .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
+ movlo r12,#0 @ is it time to zero r12?
+
+ .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
+ veor q10,q10,q3 @ Karatsuba pre-processing
+ .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
+ veor q0,q0,q4 @ accumulate
+ .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
+
+ veor q2,q2,q6
+ moveq r12,#0 @ is it time to zero r12?
+ veor q1,q1,q5
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+#endif
+ veor q1,q1,q10
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ vext.8 q7,q9,q9,#8
+ vext.8 q3,q8,q8,#8
+ veor q0,q1,q10
+ .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q3,q3,q2 @ accumulate q3 early
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+ .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q3,q3,q10
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ veor q3,q3,q0
+ .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
+
+ veor q2,q2,q10
+ vext.8 q3,q8,q8,#8 @ re-construct q3
+ adds r3,r3,#32 @ re-construct r3
+ veor q0,q0,q2 @ re-construct q0
+ beq .Ldone_v8 @ is r3 zero?
+.Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
- veor q9,q9,q10 @ q9 is rotated inp^Xi
+ veor q9,q8,q10 @ q9 is rotated inp^Xi
-.Lgmult_v8:
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
- subs r3,r3,#16
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
- moveq r12,#0
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
- vld1.64 {q9},[r2],r12 @ load [rotated] inp
veor q1,q1,q10
- .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+ .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
veor q0,q1,q10
- vext.8 q3,q9,q9,#8
- vext.8 q10,q0,q0,#8 @ 2nd phase
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
- bhs .Loop_v8
+.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
+ vldmia sp!,{d8-d15} @ 32-bit ABI says so
bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
diff --git a/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S b/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S
index bf1ce4f997..683f1cc0c8 100644
--- a/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S
+++ b/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S
@@ -1,7 +1,59 @@
-#include "arm_arch.h"
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
.text
+#if __ARM_ARCH__<7
.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
.type K256,%object
.align 5
@@ -24,7 +76,7 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
@@ -33,9 +85,12 @@ K256:
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
@@ -43,6 +98,7 @@ sha256_block_data_order:
tst r12,#ARMV7_NEON
bne .LNEON
#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256+32 @ K256
@@ -1736,6 +1792,9 @@ sha256_block_data_order:
eor r12,r12,r6 @ Maj(a,b,c)
add r4,r4,r0,ror#2 @ h+=Sigma0(a)
@ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq r3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
@@ -1777,16 +1836,19 @@ sha256_block_data_order:
.arch armv7-a
.fpu neon
+.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
mov r12,sp
- sub sp,sp,#16*4+16 @ alloca
- sub r14,r3,#256+32 @ K256
- bic sp,sp,#15 @ align for 128-bit stores
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]!
@@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
ldr r0,[sp,#72]
sub r14,r14,#256 @ rewind r14
teq r1,r0
+ it eq
subeq r1,r1,#64 @ avoid SEGV
vld1.8 {q0},[r1]! @ load next input block
vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]!
+ it ne
strne r1,[sp,#68]
mov r1,sp
add r11,r11,r2
@@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
str r7,[r2],#4
stmia r2,{r8-r11}
+ ittte ne
movne r1,sp
ldrne r2,[sp,#0]
eorne r12,r12,r12
ldreq sp,[sp,#76] @ restore original sp
+ itt ne
eorne r3,r5,r6
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {q0,q1},[r0]
- sub r3,r3,#sha256_block_data_order-K256
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {q8-q9},[r1]!
@@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
teq r1,r2
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
- .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
- .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
- .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
- .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
- .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
- .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vld1.32 {q13},[r3]
vadd.i32 q12,q12,q10
sub r3,r3,#256-16 @ rewind
vmov q2,q0
- .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
- .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vadd.i32 q13,q13,q11
vmov q2,q0
- .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
- .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vadd.i32 q0,q0,q14
vadd.i32 q1,q1,q15
+ it ne
bne .Loop_v8
vst1.32 {q0,q1},[r0]
@@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
#endif
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
diff --git a/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S b/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S
index 0a4b1ac4c4..f5dd6cbb86 100644
--- a/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S
+++ b/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S
@@ -227,17 +227,17 @@ aes_v8_encrypt:
.Loop_enc:
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -256,17 +256,17 @@ aes_v8_decrypt:
.Loop_dec:
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
- ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
- ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
+ ld1 {v2.4s-v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
- subs w6,w6,#2
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
aese v0.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
- b.gt .Loop_cbc_enc
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
@@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
- add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
@@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
-
- mov w6,w5
eor v6.16b,v0.16b,v7.16b
- st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
+ st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
@@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
- aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- eor v4.16b,v6.16b,v7.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
- aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- eor v17.16b,v3.16b,v7.16b
- subs x2,x2,#0x30
aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
- csel x6,x2,x6,lo // x6, w6, is zero at this point
- aesd v0.16b,v20.16b
- aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
+ eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- mov x7,x3
- aesd v0.16b,v21.16b
- aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
- aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
- ld1 {v19.16b},[x0],#16
aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
-
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- orr v0.16b,v2.16b,v2.16b
st1 {v4.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
+ orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
@@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
- aesd v18.16b,v16.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
- aesd v18.16b,v17.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
- aesd v18.16b,v20.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
- aesd v18.16b,v21.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
- aesd v18.16b,v22.16b
aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
@@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- aese v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- aese v18.16b,v16.16b
- mov x7,x3
aesmc v4.16b,v0.16b
- ld1 {v2.16b},[x0],#16
+ aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
- aesmc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
- aese v5.16b,v17.16b
- aese v18.16b,v17.16b
orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
- ld1 {v19.16b},[x0],#16
+ aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
- aese v17.16b,v20.16b
+ aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
+ aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
- aese v17.16b,v21.16b
+ aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
- aesmc v4.16b,v4.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- aesmc v5.16b,v5.16b
+ aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
- aese v17.16b,v22.16b
+ aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
- aesmc v4.16b,v4.16b
- aesmc v5.16b,v5.16b
+ aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
@@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
aese v5.16b,v23.16b
aese v17.16b,v23.16b
- mov w6,w5
eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v2.16b},[x1],#16
- st1 {v3.16b},[x1],#16
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
@@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
- ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
- ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
- aese v1.16b,v16.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
- aese v1.16b,v17.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
- aese v1.16b,v20.16b
- ld1 {v3.16b},[x0]
aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
- aese v1.16b,v21.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
- aese v0.16b,v22.16b
- aese v1.16b,v22.16b
eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
diff --git a/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S b/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S
index 1bfb26340a..479007dc54 100644
--- a/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S
+++ b/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S
@@ -6,103 +6,215 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
- ld1 {v17.2d},[x1] //load H
- movi v16.16b,#0xe1
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
- shl v16.2d,v16.2d,#57
- ushr v18.2d,v16.2d,#63
- ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
- ushr v19.2d,v3.2d,#63
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
- and v19.16b,v19.16b,v16.16b
+ and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
- ext v19.16b,v19.16b,v19.16b,#8
+ ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
- orr v3.16b,v3.16b,v19.16b //H<<<=1
- eor v3.16b,v3.16b,v16.16b //twisted H
- st1 {v3.2d},[x0]
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
-
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
- ld1 {v20.2d},[x1] //load twisted H
+ ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
- ext v21.16b,v20.16b,v20.16b,#8
- mov x3,#0
ext v3.16b,v17.16b,v17.16b,#8
- mov x12,#0
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- mov x2,x0
- b .Lgmult_v8
-.size gcm_gmult_v8,.-gcm_gmult_v8
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
- subs x3,x3,#16
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+					//alorithm specification //[sic in upstream; read: algorithm]
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+					//to preclude overstepping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
- mov x12,#16
- ld1 {v20.2d},[x1] //load twisted H
- csel x12,xzr,x12,eq
- ext v0.16b,v0.16b,v0.16b,#8
- shl v19.2d,v19.2d,#57
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
- ext v21.16b,v20.16b,v20.16b,#8
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo .Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
- eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
- ext v3.16b,v17.16b,v17.16b,#8
- b .Loop_v8
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b .Loop_mod2x_v8
.align 4
-.Loop_v8:
+.Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __ARMEB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq .Ldone_v8 //is x3 zero?
+.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
- eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
-.Lgmult_v8:
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
- subs x3,x3,#16
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
- csel x12,xzr,x12,eq
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
- ld1 {v17.2d},[x2],x12 //load [rotated] inp
eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
-#ifndef __ARMEB__
- rev64 v17.16b,v17.16b
-#endif
eor v0.16b,v1.16b,v18.16b
- ext v3.16b,v17.16b,v17.16b,#8
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
- b.hs .Loop_v8
+.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
index 84708afbbb..6573fe4be3 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s
@@ -17,7 +17,10 @@ aesni_encrypt:
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
@@ -38,7 +41,10 @@ aesni_decrypt:
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
.type _aesni_encrypt2,@function
@@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lenc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Ldec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,220,217
pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp .Lenc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
+.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,222,217
pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp .Ldec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
+.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -587,6 +577,7 @@ aesni_ecb_encrypt:
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ aesni_ecb_encrypt:
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
@@ -731,14 +730,23 @@ aesni_ecb_encrypt:
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
@@ -754,49 +762,73 @@ aesni_ecb_encrypt:
jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
leaq 16(%r11),%r11
jnz .Loop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je .Lctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
+
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp .Lctr32_done
-.align 16
-.Lctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-.Loop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz .Loop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp .Lctr32_done
-
-.align 16
.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
shrl $4,%eax
.Lxts_enc_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
shrl $4,%eax
.Lxts_dec_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
@@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
- jle .Lcbc_dec_tail_collected
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
movdqa %xmm7,%xmm2
addq $80,%rdx
- jle .Lcbc_dec_tail_collected
+ jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
subq $16,%rdx
jmp .Lcbc_dec_tail_collected
@@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-.Loop_dec1_16:
+.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz .Loop_dec1_16
+ jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz .Lenc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@@ -3008,9 +3212,79 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@@ -3034,10 +3308,54 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ __aesni_set_encrypt_key:
jmp .Lenc_key_ret
.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
index 1bf368c7eb..5f98ff2237 100644
--- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s
@@ -1755,11 +1755,16 @@ bn_from_mont8x:
.type bn_get_bits5,@function
.align 16
bn_get_bits5:
- movq %rdi,%r10
+ leaq 0(%rdi),%r10
+ leaq 1(%rdi),%r11
movl %esi,%ecx
- shrl $3,%esi
- movzwl (%r10,%rsi,1),%eax
- andl $7,%ecx
+ shrl $4,%esi
+ andl $15,%ecx
+ leal -8(%rcx),%eax
+ cmpl $11,%ecx
+ cmovaq %r11,%r10
+ cmoval %eax,%ecx
+ movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
index 57509ae719..41ad80eebd 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s
@@ -17,7 +17,10 @@ L$oop_enc1_1:
leaq 16(%rdx),%rdx
jnz L$oop_enc1_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -38,7 +41,10 @@ L$oop_dec1_2:
leaq 16(%rdx),%rdx
jnz L$oop_dec1_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$enc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
addq $16,%rax
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$dec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,220,217
pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
pxor %xmm0,%xmm9
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp L$enc_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
+L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
- addq $16,%rax
pxor %xmm0,%xmm7
-.byte 102,15,56,222,217
pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
pxor %xmm0,%xmm9
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups -16(%rcx,%rax,1),%xmm0
- jmp L$dec_loop8_enter
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
+L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -587,6 +577,7 @@ L$ecb_enc_tail:
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz L$ecb_ret
@@ -731,14 +730,23 @@ L$ecb_dec_tail:
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
@@ -754,49 +762,73 @@ L$oop_dec1_4:
jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
L$ecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
@@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
@@ -944,21 +982,56 @@ L$oop_enc1_6:
leaq 16(%r11),%r11
jnz L$oop_enc1_6
.byte 102,15,56,221,217
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
movups %xmm3,(%r9)
+ pxor %xmm3,%xmm3
+ pxor %xmm8,%xmm8
+ pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ jne L$ctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_7:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp L$ctr32_epilogue
+
+.p2align 4
+L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
- cmpq $1,%rdx
- je L$ctr32_one_shortcut
+
+
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
leaq -128(%rcx),%rcx
L$ctr32_tail:
+
+
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
+
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@@ -1456,30 +1532,33 @@ L$ctr32_loop3:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
- jmp L$ctr32_done
-.p2align 4
-L$ctr32_one_shortcut:
- movups (%r8),%xmm2
- movups (%rdi),%xmm10
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_7:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_7
-.byte 102,15,56,221,209
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
L$ctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %r11d,%r11d
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
@@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
shrl $4,%eax
L$xts_enc_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@@ -1778,6 +1858,7 @@ L$xts_enc_short:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm7
call _aesni_encrypt6
@@ -1920,6 +2001,29 @@ L$oop_enc1_10:
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
@@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
shrl $4,%eax
L$xts_dec_short:
+
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@@ -2398,6 +2503,29 @@ L$oop_dec1_14:
movups %xmm2,(%rsi)
L$xts_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
@@ -2446,7 +2574,11 @@ L$oop_enc1_15:
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
@@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
.p2align 4
L$cbc_decrypt:
+ cmpq $16,%rdx
+ jne L$cbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_16:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_ret
+.p2align 4
+L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@@ -2702,7 +2863,7 @@ L$cbc_dec_done:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
- jle L$cbc_dec_tail_collected
+ jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
movdqa %xmm7,%xmm2
addq $80,%rdx
- jle L$cbc_dec_tail_collected
+ jle L$cbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
subq $16,%rdx
jmp L$cbc_dec_tail_collected
@@ -2847,12 +3025,12 @@ L$cbc_dec_one:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
-L$oop_dec1_16:
+L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
- jnz L$oop_dec1_16
+ jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@@ -2866,6 +3044,7 @@ L$cbc_dec_two:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2878,7 +3057,9 @@ L$cbc_dec_three:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@@ -2891,29 +3072,45 @@ L$cbc_dec_four:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
+L$cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
@@ -2951,7 +3148,9 @@ L$dec_key_inverse:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz L$enc_key_ret
+ movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
+ andl _OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
L$10rounds:
movl $9,%esi
+ cmpl $268435456,%r10d
+ je L$10rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
@@ -3008,9 +3212,79 @@ L$10rounds:
jmp L$enc_key_ret
.p2align 4
+L$10rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
+ cmpl $268435456,%r10d
+ je L$12rounds_alt
+
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
@@ -3034,10 +3308,54 @@ L$12rounds:
jmp L$enc_key_ret
.p2align 4
+L$12rounds_alt:
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je L$14rounds_alt
+
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@@ -3072,9 +3390,69 @@ L$14rounds:
jmp L$enc_key_ret
.p2align 4
+L$14rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
@@ -3160,6 +3538,14 @@ L$xts_magic:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long 1,1,1,1
+L$key_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
index ba4d62157c..049bf06473 100644
--- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
+++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s
@@ -1755,11 +1755,16 @@ L$from_epilogue:
.p2align 4
_bn_get_bits5:
- movq %rdi,%r10
+ leaq 0(%rdi),%r10
+ leaq 1(%rdi),%r11
movl %esi,%ecx
- shrl $3,%esi
- movzwl (%r10,%rsi,1),%eax
- andl $7,%ecx
+ shrl $4,%esi
+ andl $15,%ecx
+ leal -8(%rcx),%eax
+ cmpl $11,%ecx
+ cmovaq %r11,%r10
+ cmoval %eax,%ecx
+ movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm
index 9473352638..34b554f9a9 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm
@@ -60,77 +60,6 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98
DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108
DB 46,111,114,103,62,0
ALIGN 64
- mov rsi,rax
- mov rax,QWORD PTR[((64+56))+rax]
- lea rax,QWORD PTR[48+rax]
-
- mov rbx,QWORD PTR[((-8))+rax]
- mov rbp,QWORD PTR[((-16))+rax]
- mov r12,QWORD PTR[((-24))+rax]
- mov r13,QWORD PTR[((-32))+rax]
- mov r14,QWORD PTR[((-40))+rax]
- mov r15,QWORD PTR[((-48))+rax]
- mov QWORD PTR[144+r8],rbx
- mov QWORD PTR[160+r8],rbp
- mov QWORD PTR[216+r8],r12
- mov QWORD PTR[224+r8],r13
- mov QWORD PTR[232+r8],r14
- mov QWORD PTR[240+r8],r15
-
- lea rsi,QWORD PTR[((64+64))+rsi]
- lea rdi,QWORD PTR[512+r8]
- mov ecx,20
- DD 0a548f3fch
-
-$L$in_prologue::
- mov rdi,QWORD PTR[8+rax]
- mov rsi,QWORD PTR[16+rax]
- mov QWORD PTR[152+r8],rax
- mov QWORD PTR[168+r8],rsi
- mov QWORD PTR[176+r8],rdi
-
- mov rdi,QWORD PTR[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0a548f3fch
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD PTR[8+rsi]
- mov r8,QWORD PTR[rsi]
- mov r9,QWORD PTR[16+rsi]
- mov r10,QWORD PTR[40+rsi]
- lea r11,QWORD PTR[56+rsi]
- lea r12,QWORD PTR[24+rsi]
- mov QWORD PTR[32+rsp],r10
- mov QWORD PTR[40+rsp],r11
- mov QWORD PTR[48+rsp],r12
- mov QWORD PTR[56+rsp],rcx
- call QWORD PTR[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- DB 0F3h,0C3h ;repret
-
.text$ ENDS
-.pdata SEGMENT READONLY ALIGN(4)
- DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_xop
- DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_xop
- DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_xop
-
- DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_avx
- DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_avx
- DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_avx
-
-.pdata ENDS
END
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm
index 53d8afc950..5e848125d6 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm
@@ -18,7 +18,10 @@ DB 102,15,56,220,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_enc1_1
DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
+ pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_encrypt ENDP
@@ -39,7 +42,10 @@ DB 102,15,56,222,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_dec1_2
DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
+ pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_decrypt ENDP
@@ -265,21 +271,18 @@ DB 102,15,56,220,217
pxor xmm6,xmm0
DB 102,15,56,220,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop6_enter
ALIGN 16
$L$enc_loop6::
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
+$L$enc_loop6_enter::
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
-$L$enc_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,220,208
@@ -322,21 +325,18 @@ DB 102,15,56,222,217
pxor xmm6,xmm0
DB 102,15,56,222,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop6_enter
ALIGN 16
$L$dec_loop6::
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
+$L$dec_loop6_enter::
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
-$L$dec_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,222,208
@@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,220,209
- add rax,16
pxor xmm7,xmm0
-DB 102,15,56,220,217
pxor xmm8,xmm0
+DB 102,15,56,220,217
pxor xmm9,xmm0
-DB 102,15,56,220,225
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
-DB 102,68,15,56,220,193
-DB 102,68,15,56,220,201
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
- jmp $L$enc_loop8_enter
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
+ add rax,16
+ jmp $L$enc_loop8_inner
ALIGN 16
$L$enc_loop8::
DB 102,15,56,220,209
DB 102,15,56,220,217
+$L$enc_loop8_inner::
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
@@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,222,209
- add rax,16
pxor xmm7,xmm0
-DB 102,15,56,222,217
pxor xmm8,xmm0
+DB 102,15,56,222,217
pxor xmm9,xmm0
-DB 102,15,56,222,225
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
-DB 102,68,15,56,222,193
-DB 102,68,15,56,222,201
- movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
- jmp $L$dec_loop8_enter
+ movups xmm0,XMMWORD PTR[rax*1+rcx]
+ add rax,16
+ jmp $L$dec_loop8_inner
ALIGN 16
$L$dec_loop8::
DB 102,15,56,222,209
DB 102,15,56,222,217
+$L$dec_loop8_inner::
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
@@ -605,6 +595,7 @@ $L$ecb_enc_tail::
movups xmm7,XMMWORD PTR[80+rdi]
je $L$ecb_enc_six
movdqu xmm8,XMMWORD PTR[96+rdi]
+ xorps xmm9,xmm9
call _aesni_encrypt8
movups XMMWORD PTR[rsi],xmm2
movups XMMWORD PTR[16+rsi],xmm3
@@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
jnc $L$ecb_dec_loop8
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
mov rcx,r11
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
mov eax,r10d
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
+ pxor xmm8,xmm8
movups XMMWORD PTR[112+rsi],xmm9
+ pxor xmm9,xmm9
lea rsi,QWORD PTR[128+rsi]
add rdx,080h
jz $L$ecb_ret
@@ -749,14 +748,23 @@ $L$ecb_dec_tail::
je $L$ecb_dec_six
movups xmm8,XMMWORD PTR[96+rdi]
movups xmm0,XMMWORD PTR[rcx]
+ xorps xmm9,xmm9
call _aesni_decrypt8
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_one::
@@ -772,53 +780,81 @@ DB 102,15,56,222,209
jnz $L$oop_dec1_4
DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_two::
call _aesni_decrypt2
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_three::
call _aesni_decrypt3
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_four::
call _aesni_decrypt4
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_five::
xorps xmm7,xmm7
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_six::
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
$L$ecb_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ecb_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -898,11 +934,21 @@ DB 102,15,56,0,215
lea rsi,QWORD PTR[16+rsi]
jnz $L$ccm64_enc_outer
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
+ pxor xmm3,xmm3
+ pxor xmm8,xmm8
+ pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -1016,11 +1062,21 @@ DB 102,15,56,220,217
lea r11,QWORD PTR[16+r11]
jnz $L$oop_enc1_6
DB 102,15,56,221,217
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
+ pxor xmm3,xmm3
+ pxor xmm8,xmm8
+ pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
+ movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_dec_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
mov r8,QWORD PTR[40+rsp]
+ cmp rdx,1
+ jne $L$ctr32_bulk
+
+
+
+ movups xmm2,XMMWORD PTR[r8]
+ movups xmm3,XMMWORD PTR[rdi]
+ mov edx,DWORD PTR[240+rcx]
+ movups xmm0,XMMWORD PTR[rcx]
+ movups xmm1,XMMWORD PTR[16+rcx]
+ lea rcx,QWORD PTR[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_enc1_7::
+DB 102,15,56,220,209
+ dec edx
+ movups xmm1,XMMWORD PTR[rcx]
+ lea rcx,QWORD PTR[16+rcx]
+ jnz $L$oop_enc1_7
+DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD PTR[rsi],xmm2
+ xorps xmm2,xmm2
+ jmp $L$ctr32_epilogue
+
+ALIGN 16
+$L$ctr32_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,288
@@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
$L$ctr32_body::
lea rbp,QWORD PTR[((-8))+rax]
- cmp rdx,1
- je $L$ctr32_one_shortcut
+
+
movdqu xmm2,XMMWORD PTR[r8]
movdqu xmm0,XMMWORD PTR[rcx]
@@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202
lea rcx,QWORD PTR[((-128))+rcx]
$L$ctr32_tail::
+
+
lea rcx,QWORD PTR[16+rcx]
cmp rdx,4
jb $L$ctr32_loop3
je $L$ctr32_loop4
+
shl eax,4
movdqa xmm8,XMMWORD PTR[96+rsp]
pxor xmm9,xmm9
@@ -1559,40 +1647,43 @@ DB 102,15,56,221,225
movups xmm12,XMMWORD PTR[32+rdi]
xorps xmm4,xmm12
movups XMMWORD PTR[32+rsi],xmm4
- jmp $L$ctr32_done
-ALIGN 16
-$L$ctr32_one_shortcut::
- movups xmm2,XMMWORD PTR[r8]
- movups xmm10,XMMWORD PTR[rdi]
- mov eax,DWORD PTR[240+rcx]
- movups xmm0,XMMWORD PTR[rcx]
- movups xmm1,XMMWORD PTR[16+rcx]
- lea rcx,QWORD PTR[32+rcx]
- xorps xmm2,xmm0
-$L$oop_enc1_7::
-DB 102,15,56,220,209
- dec eax
- movups xmm1,XMMWORD PTR[rcx]
- lea rcx,QWORD PTR[16+rcx]
- jnz $L$oop_enc1_7
-DB 102,15,56,221,209
- xorps xmm2,xmm10
- movups XMMWORD PTR[rsi],xmm2
- jmp $L$ctr32_done
-
-ALIGN 16
$L$ctr32_done::
+ xorps xmm0,xmm0
+ xor r11d,r11d
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
+ movaps XMMWORD PTR[112+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$ctr32_epilogue::
@@ -1889,6 +1980,7 @@ DB 102,15,56,221,124,36,80
shr eax,4
$L$xts_enc_short::
+
mov r10d,eax
pxor xmm10,xmm0
add rdx,16*6
@@ -1917,6 +2009,7 @@ $L$xts_enc_short::
pxor xmm4,xmm12
pxor xmm5,xmm13
pxor xmm6,xmm14
+ pxor xmm7,xmm7
call _aesni_encrypt6
@@ -2059,16 +2152,39 @@ DB 102,15,56,221,209
movups XMMWORD PTR[(-16)+rsi],xmm2
$L$xts_enc_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_enc_epilogue::
@@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80
shr eax,4
$L$xts_dec_short::
+
mov r10d,eax
pxor xmm10,xmm0
pxor xmm11,xmm0
@@ -2573,16 +2690,39 @@ DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
$L$xts_dec_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
+ movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
+ movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
+ movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
+ movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
+ movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
+ movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
+ movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
+ movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
+ movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
+ movaps XMMWORD PTR[(-16)+rbp],xmm0
+ movaps XMMWORD PTR[rsp],xmm0
+ movaps XMMWORD PTR[16+rsp],xmm0
+ movaps XMMWORD PTR[32+rsp],xmm0
+ movaps XMMWORD PTR[48+rsp],xmm0
+ movaps XMMWORD PTR[64+rsp],xmm0
+ movaps XMMWORD PTR[80+rsp],xmm0
+ movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_dec_epilogue::
@@ -2646,7 +2786,11 @@ DB 102,15,56,221,209
jnc $L$cbc_enc_loop
add rdx,16
jnz $L$cbc_enc_tail
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR[r8],xmm2
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
jmp $L$cbc_ret
$L$cbc_enc_tail::
@@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
ALIGN 16
$L$cbc_decrypt::
+ cmp rdx,16
+ jne $L$cbc_decrypt_bulk
+
+
+
+ movdqu xmm2,XMMWORD PTR[rdi]
+ movdqu xmm3,XMMWORD PTR[r8]
+ movdqa xmm4,xmm2
+ movups xmm0,XMMWORD PTR[rcx]
+ movups xmm1,XMMWORD PTR[16+rcx]
+ lea rcx,QWORD PTR[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_dec1_16::
+DB 102,15,56,222,209
+ dec r10d
+ movups xmm1,XMMWORD PTR[rcx]
+ lea rcx,QWORD PTR[16+rcx]
+ jnz $L$oop_dec1_16
+DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movdqu XMMWORD PTR[r8],xmm4
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
+ jmp $L$cbc_ret
+ALIGN 16
+$L$cbc_decrypt_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,176
@@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202
movaps xmm2,xmm9
lea rcx,QWORD PTR[((-112))+rcx]
add rdx,070h
- jle $L$cbc_dec_tail_collected
+ jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm9
lea rsi,QWORD PTR[16+rsi]
cmp rdx,050h
@@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
lea rsi,QWORD PTR[80+rsi]
movdqa xmm2,xmm7
+ pxor xmm7,xmm7
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
+ pxor xmm6,xmm6
pxor xmm8,xmm9
movdqu XMMWORD PTR[80+rsi],xmm7
+ pxor xmm7,xmm7
lea rsi,QWORD PTR[96+rsi]
movdqa xmm2,xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
movdqa xmm2,xmm7
add rdx,050h
- jle $L$cbc_dec_tail_collected
+ jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm7
lea rsi,QWORD PTR[16+rsi]
@@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
+ pxor xmm5,xmm5
lea rsi,QWORD PTR[64+rsi]
movdqa xmm2,xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
sub rdx,010h
jmp $L$cbc_dec_tail_collected
@@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
-$L$oop_dec1_16::
+$L$oop_dec1_17::
DB 102,15,56,222,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
- jnz $L$oop_dec1_16
+ jnz $L$oop_dec1_17
DB 102,15,56,223,209
xorps xmm2,xmm10
movaps xmm10,xmm11
@@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
pxor xmm3,xmm11
movdqu XMMWORD PTR[rsi],xmm2
movdqa xmm2,xmm3
+ pxor xmm3,xmm3
lea rsi,QWORD PTR[16+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
movdqa xmm2,xmm4
+ pxor xmm4,xmm4
lea rsi,QWORD PTR[32+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
+ pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
+ pxor xmm4,xmm4
movdqa xmm2,xmm5
+ pxor xmm5,xmm5
lea rsi,QWORD PTR[48+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
+$L$cbc_dec_clear_tail_collected::
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
$L$cbc_dec_tail_collected::
movups XMMWORD PTR[r8],xmm10
and rdx,15
jnz $L$cbc_dec_tail_partial
movups XMMWORD PTR[rsi],xmm2
+ pxor xmm2,xmm2
jmp $L$cbc_dec_ret
ALIGN 16
$L$cbc_dec_tail_partial::
movaps XMMWORD PTR[rsp],xmm2
+ pxor xmm2,xmm2
mov rcx,16
mov rdi,rsi
sub rcx,rdx
lea rsi,QWORD PTR[rsp]
DD 09066A4F3h
+ movdqa XMMWORD PTR[rsp],xmm2
$L$cbc_dec_ret::
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[16+rsp]
+ movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm7,XMMWORD PTR[32+rsp]
+ movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm8,XMMWORD PTR[48+rsp]
+ movaps XMMWORD PTR[48+rsp],xmm0
movaps xmm9,XMMWORD PTR[64+rsp]
+ movaps XMMWORD PTR[64+rsp],xmm0
movaps xmm10,XMMWORD PTR[80+rsp]
+ movaps XMMWORD PTR[80+rsp],xmm0
movaps xmm11,XMMWORD PTR[96+rsp]
+ movaps XMMWORD PTR[96+rsp],xmm0
movaps xmm12,XMMWORD PTR[112+rsp]
+ movaps XMMWORD PTR[112+rsp],xmm0
movaps xmm13,XMMWORD PTR[128+rsp]
+ movaps XMMWORD PTR[128+rsp],xmm0
movaps xmm14,XMMWORD PTR[144+rsp]
+ movaps XMMWORD PTR[144+rsp],xmm0
movaps xmm15,XMMWORD PTR[160+rsp]
+ movaps XMMWORD PTR[160+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$cbc_ret::
@@ -3175,7 +3390,9 @@ DB 102,15,56,219,201
movups xmm0,XMMWORD PTR[r8]
DB 102,15,56,219,192
+ pxor xmm1,xmm1
movups XMMWORD PTR[rcx],xmm0
+ pxor xmm0,xmm0
$L$dec_key_ret::
add rsp,8
DB 0F3h,0C3h ;repret
@@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h
test r8,r8
jz $L$enc_key_ret
+ mov r10d,268437504
movups xmm0,XMMWORD PTR[rcx]
xorps xmm4,xmm4
+ and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
lea rax,QWORD PTR[16+r8]
cmp edx,256
je $L$14rounds
@@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h
$L$10rounds::
mov edx,9
+ cmp r10d,268435456
+ je $L$10rounds_alt
+
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,200,1
call $L$key_expansion_128_cold
@@ -3232,9 +3454,79 @@ DB 102,15,58,223,200,54
jmp $L$enc_key_ret
ALIGN 16
+$L$10rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate]
+ mov r10d,8
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD PTR[r8],xmm0
+ jmp $L$oop_key128
+
+ALIGN 16
+$L$oop_key128::
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ lea rax,QWORD PTR[16+rax]
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[(-16)+rax],xmm0
+ movdqa xmm2,xmm0
+
+ dec r10d
+ jnz $L$oop_key128
+
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1b]
+
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[rax],xmm0
+
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[16+rax],xmm0
+
+ mov DWORD PTR[96+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$12rounds::
movq xmm2,QWORD PTR[16+rcx]
mov edx,11
+ cmp r10d,268435456
+ je $L$12rounds_alt
+
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,202,1
call $L$key_expansion_192a_cold
@@ -3258,10 +3550,54 @@ DB 102,15,58,223,202,128
jmp $L$enc_key_ret
ALIGN 16
+$L$12rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate192]
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ mov r10d,8
+ movdqu XMMWORD PTR[r8],xmm0
+ jmp $L$oop_key192
+
+ALIGN 16
+$L$oop_key192::
+ movq QWORD PTR[rax],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ pslld xmm4,1
+ lea rax,QWORD PTR[24+rax]
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+
+ pshufd xmm3,xmm0,0ffh
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD PTR[(-16)+rax],xmm0
+
+ dec r10d
+ jnz $L$oop_key192
+
+ mov DWORD PTR[32+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$14rounds::
movups xmm2,XMMWORD PTR[16+rcx]
mov edx,13
lea rax,QWORD PTR[16+rax]
+ cmp r10d,268435456
+ je $L$14rounds_alt
+
movups XMMWORD PTR[r8],xmm0
movups XMMWORD PTR[16+r8],xmm2
DB 102,15,58,223,202,1
@@ -3296,9 +3632,69 @@ DB 102,15,58,223,202,64
jmp $L$enc_key_ret
ALIGN 16
+$L$14rounds_alt::
+ movdqa xmm5,XMMWORD PTR[$L$key_rotate]
+ movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
+ mov r10d,7
+ movdqu XMMWORD PTR[r8],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD PTR[16+r8],xmm2
+ jmp $L$oop_key256
+
+ALIGN 16
+$L$oop_key256::
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR[rax],xmm0
+
+ dec r10d
+ jz $L$done_key256
+
+ pshufd xmm2,xmm0,0ffh
+ pxor xmm3,xmm3
+DB 102,15,56,221,211
+
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+
+ pxor xmm2,xmm1
+ movdqu XMMWORD PTR[16+rax],xmm2
+ lea rax,QWORD PTR[32+rax]
+ movdqa xmm1,xmm2
+
+ jmp $L$oop_key256
+
+$L$done_key256::
+ mov DWORD PTR[16+rax],edx
+ xor eax,eax
+ jmp $L$enc_key_ret
+
+ALIGN 16
$L$bad_keybits::
mov rax,-2
$L$enc_key_ret::
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
add rsp,8
DB 0F3h,0C3h ;repret
$L$SEH_end_set_encrypt_key::
@@ -3384,6 +3780,14 @@ $L$xts_magic::
DD 087h,0,1,0
$L$increment1::
DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate::
+ DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
+$L$key_rotate192::
+ DD 004070605h,004070605h,004070605h,004070605h
+$L$key_rcon1::
+ DD 1,1,1,1
+$L$key_rcon1b::
+ DD 01bh,01bh,01bh,01bh
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
@@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE
mov rax,QWORD PTR[152+r8]
mov rbx,QWORD PTR[248+r8]
- lea r10,QWORD PTR[$L$cbc_decrypt]
+ lea r10,QWORD PTR[$L$cbc_decrypt_bulk]
cmp rbx,r10
jb $L$common_seh_tail
diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
index c47130f44c..f690ba58d3 100644
--- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
+++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm
@@ -1832,11 +1832,16 @@ PUBLIC bn_get_bits5
ALIGN 16
bn_get_bits5 PROC PUBLIC
- mov r10,rcx
+ lea r10,QWORD PTR[rcx]
+ lea r11,QWORD PTR[1+rcx]
mov ecx,edx
- shr edx,3
- movzx eax,WORD PTR[rdx*1+r10]
- and ecx,7
+ shr edx,4
+ and ecx,15
+ lea eax,DWORD PTR[((-8))+rcx]
+ cmp ecx,11
+ cmova r10,r11
+ cmova ecx,eax
+ movzx eax,WORD PTR[rdx*2+r10]
shr eax,cl
and eax,31
DB 0F3h,0C3h ;repret
diff --git a/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s b/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s
index a68f7cdbe9..3bbc4e47d6 100644
--- a/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s
+++ b/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s
@@ -21,7 +21,10 @@ aesni_encrypt:
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_encrypt,.-.L_aesni_encrypt_begin
.globl aesni_decrypt
@@ -45,7 +48,10 @@ aesni_decrypt:
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
.type _aesni_encrypt2,@function
@@ -259,17 +265,15 @@ _aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_encrypt6_enter
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L008enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -283,7 +287,7 @@ _aesni_encrypt6:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L008enc6_loop
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -315,17 +319,15 @@ _aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_decrypt6_enter
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L009dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -339,7 +341,7 @@ _aesni_decrypt6:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L009dec6_loop
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -369,14 +371,14 @@ aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L010ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L011ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L012ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -385,9 +387,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L013ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L014ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -402,12 +404,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L013ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L014ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -416,18 +418,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L012ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L015ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L016ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L017ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L018ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
@@ -436,49 +438,49 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L019enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L019enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_two:
+.L018ecb_enc_two:
call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L017ecb_enc_three:
+.L019ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L018ecb_enc_four:
+.L020ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L011ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L020ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -487,9 +489,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L021ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L022ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -504,12 +506,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L021ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L022ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -518,18 +520,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L020ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L023ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L024ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L025ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L026ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
@@ -538,43 +540,51 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L027dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L027dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_two:
+.L026ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L025ecb_dec_three:
+.L027ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L026ecb_dec_four:
+.L028ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L010ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-.L028ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-.L029ccm64_enc2_loop:
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L029ccm64_enc2_loop
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz .L028ccm64_enc_outer
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L030enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L030enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L031ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L032ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-.L033ccm64_dec2_loop:
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L033ccm64_dec2_loop
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L032ccm64_dec_break:
+.L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L034enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L034enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L035ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -833,7 +859,7 @@ aesni_ctr32_encrypt_blocks:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L036ctr32_tail
+ jb .L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L037ctr32_loop6
+ jmp .L039ctr32_loop6
.align 16
-.L037ctr32_loop6:
+.L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L037ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L038ctr32_ret
+ jz .L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-.L036ctr32_tail:
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L039ctr32_one
+ jb .L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L040ctr32_two
+ je .L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L041ctr32_three
+ jb .L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L042ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
@@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L035ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L039ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L043enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L043enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_two:
+.L042ctr32_two:
call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L041ctr32_three:
+.L043ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L042ctr32_four:
+.L044ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L038ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1020,12 +1057,12 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L044enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L044enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1049,14 +1086,14 @@ aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L045xts_enc_short
+ jc .L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L046xts_enc_loop6
+ jmp .L048xts_enc_loop6
.align 16
-.L046xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1145,23 +1182,23 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L046xts_enc_loop6
+ jnc .L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L045xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L047xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L048xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L049xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1170,7 +1207,7 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L050xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1180,7 +1217,7 @@ aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L051xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1212,9 +1249,9 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1222,20 +1259,20 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L053enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L053enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1249,9 +1286,9 @@ aesni_xts_encrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L050xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1269,9 +1306,9 @@ aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L051xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1293,28 +1330,28 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L055xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L052xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L055xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1322,7 +1359,7 @@ aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L055xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1332,16 +1369,30 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L056enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L056enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L054xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1366,12 +1417,12 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L057enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L057enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1400,14 +1451,14 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L058xts_dec_short
+ jc .L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L059xts_dec_loop6
+ jmp .L061xts_dec_loop6
.align 16
-.L059xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1496,23 +1547,23 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L059xts_dec_loop6
+ jnc .L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L058xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L060xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L061xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L062xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1521,7 +1572,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L063xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1531,7 +1582,7 @@ aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L064xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1563,9 +1614,9 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1573,20 +1624,20 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L066dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L066dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1600,9 +1651,9 @@ aesni_xts_decrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L063xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1620,9 +1671,9 @@ aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L064xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1644,20 +1695,20 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L068xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L065xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1667,7 +1718,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L068xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1681,16 +1732,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L070xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1698,7 +1749,7 @@ aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L070xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1708,16 +1759,30 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L071dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L071dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L067xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1743,7 +1808,7 @@ aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L072cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1751,14 +1816,14 @@ aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L073cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L074cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L075cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1766,24 +1831,25 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L076enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L076enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L075cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L074cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L077cbc_ret
-.L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1794,20 +1860,20 @@ aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L078cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L079cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L080cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L079cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1837,28 +1903,28 @@ aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L080cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L081cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L078cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L082cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L083cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L084cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L085cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1876,55 +1942,62 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L086dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L086dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_two:
+.L085cbc_dec_two:
call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L084cbc_dec_three:
+.L086cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L085cbc_dec_four:
+.L087cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1934,28 +2007,44 @@ aesni_cbc_encrypt:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L081cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L087cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L087cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L077cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L072cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1965,52 +2054,62 @@ aesni_cbc_encrypt:
.type _aesni_set_encrypt_key,@function
.align 16
_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L088bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L088bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal OPENSSL_ia32cap_P,%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08914rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L09012rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L091bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09210rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L093key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L094key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L094key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L093key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L09012rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L095key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L096key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L097key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L095key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L098key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L096key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L098key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08914rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L099key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L101key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L101key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L099key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L100key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L088bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L091bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
.globl aesni_set_encrypt_key
@@ -2164,7 +2407,7 @@ aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L102dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2172,7 +2415,7 @@ aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L103dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2182,15 +2425,24 @@ aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L103dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L102dec_key_ret:
+.L114dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
diff --git a/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s b/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s
index cecd5f83f7..c1f5aec62c 100644
--- a/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s
+++ b/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s
@@ -20,7 +20,10 @@ L000enc1_loop_1:
leal 16(%edx),%edx
jnz L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.globl _aesni_decrypt
.align 4
@@ -42,7 +45,10 @@ L001dec1_loop_2:
leal 16(%edx),%edx
jnz L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.align 4
__aesni_encrypt2:
@@ -242,17 +248,15 @@ __aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_encrypt6_enter
+ jmp L008_aesni_encrypt6_inner
.align 4,0x90
-L008enc6_loop:
+L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -266,7 +270,7 @@ L_aesni_encrypt6_enter:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L008enc6_loop
+ jnz L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -296,17 +300,15 @@ __aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp L_aesni_decrypt6_enter
+ jmp L010_aesni_decrypt6_inner
.align 4,0x90
-L009dec6_loop:
+L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -320,7 +322,7 @@ L_aesni_decrypt6_enter:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz L009dec6_loop
+ jnz L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz L010ecb_ret
+ jz L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz L011ecb_decrypt
+ jz L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L012ecb_enc_tail
+ jb L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L013ecb_enc_loop6_enter
+ jmp L015ecb_enc_loop6_enter
.align 4,0x90
-L014ecb_enc_loop6:
+L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -381,12 +383,12 @@ L014ecb_enc_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L013ecb_enc_loop6_enter:
+L015ecb_enc_loop6_enter:
call __aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L014ecb_enc_loop6
+ jnc L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L012ecb_enc_tail:
+ jz L012ecb_ret
+L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L015ecb_enc_one
+ jb L017ecb_enc_one
movups 16(%esi),%xmm3
- je L016ecb_enc_two
+ je L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L017ecb_enc_three
+ jb L019ecb_enc_three
movups 48(%esi),%xmm5
- je L018ecb_enc_four
+ je L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_encrypt6
@@ -415,49 +417,49 @@ L012ecb_enc_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L015ecb_enc_one:
+L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L019enc1_loop_3:
+L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L019enc1_loop_3
+ jnz L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L016ecb_enc_two:
+L018ecb_enc_two:
call __aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L017ecb_enc_three:
+L019ecb_enc_three:
call __aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L018ecb_enc_four:
+L020ecb_enc_four:
call __aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L011ecb_decrypt:
+L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb L020ecb_dec_tail
+ jb L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -466,9 +468,9 @@ L011ecb_decrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp L021ecb_dec_loop6_enter
+ jmp L023ecb_dec_loop6_enter
.align 4,0x90
-L022ecb_dec_loop6:
+L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -483,12 +485,12 @@ L022ecb_dec_loop6:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-L021ecb_dec_loop6_enter:
+L023ecb_dec_loop6_enter:
call __aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc L022ecb_dec_loop6
+ jnc L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz L010ecb_ret
-L020ecb_dec_tail:
+ jz L012ecb_ret
+L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb L023ecb_dec_one
+ jb L025ecb_dec_one
movups 16(%esi),%xmm3
- je L024ecb_dec_two
+ je L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb L025ecb_dec_three
+ jb L027ecb_dec_three
movups 48(%esi),%xmm5
- je L026ecb_dec_four
+ je L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call __aesni_decrypt6
@@ -517,43 +519,51 @@ L020ecb_dec_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L023ecb_dec_one:
+L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L027dec1_loop_4:
+L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L027dec1_loop_4
+ jnz L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L024ecb_dec_two:
+L026ecb_dec_two:
call __aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L025ecb_dec_three:
+L027ecb_dec_three:
call __aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L010ecb_ret
+ jmp L012ecb_ret
.align 4,0x90
-L026ecb_dec_four:
+L028ecb_dec_four:
call __aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L010ecb_ret:
+L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-L028ccm64_enc_outer:
+L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -607,7 +617,7 @@ L028ccm64_enc_outer:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-L029ccm64_enc2_loop:
+L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -615,7 +625,7 @@ L029ccm64_enc2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L029ccm64_enc2_loop
+ jnz L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -628,10 +638,18 @@ L029ccm64_enc2_loop:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz L028ccm64_enc_outer
+ jnz L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L030enc1_loop_5:
+L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L030enc1_loop_5
+ jnz L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -692,16 +710,16 @@ L030enc1_loop_5:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L031ccm64_dec_outer:
+L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz L032ccm64_dec_break
+ jz L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -709,7 +727,7 @@ L031ccm64_dec_outer:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-L033ccm64_dec2_loop:
+L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -717,7 +735,7 @@ L033ccm64_dec2_loop:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz L033ccm64_dec2_loop
+ jnz L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -725,9 +743,9 @@ L033ccm64_dec2_loop:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp L031ccm64_dec_outer
+ jmp L033ccm64_dec_outer
.align 4,0x90
-L032ccm64_dec_break:
+L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -735,16 +753,24 @@ L032ccm64_dec_break:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-L034enc1_loop_6:
+L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L034enc1_loop_6
+ jnz L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je L035ctr32_one_shortcut
+ je L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb L036ctr32_tail
+ jb L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp L037ctr32_loop6
+ jmp L039ctr32_loop6
.align 4,0x90
-L037ctr32_loop6:
+L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -871,27 +897,27 @@ L037ctr32_loop6:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc L037ctr32_loop6
+ jnc L039ctr32_loop6
addl $6,%eax
- jz L038ctr32_ret
+ jz L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-L036ctr32_tail:
+L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb L039ctr32_one
+ jb L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je L040ctr32_two
+ je L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb L041ctr32_three
+ jb L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je L042ctr32_four
+ je L044ctr32_four
por %xmm7,%xmm6
call __aesni_encrypt6
movups (%esi),%xmm1
@@ -909,29 +935,29 @@ L036ctr32_tail:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L035ctr32_one_shortcut:
+L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-L039ctr32_one:
+L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L043enc1_loop_7:
+L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L043enc1_loop_7
+ jnz L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L040ctr32_two:
+L042ctr32_two:
call __aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -939,9 +965,9 @@ L040ctr32_two:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L041ctr32_three:
+L043ctr32_three:
call __aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -952,9 +978,9 @@ L041ctr32_three:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp L038ctr32_ret
+ jmp L040ctr32_ret
.align 4,0x90
-L042ctr32_four:
+L044ctr32_four:
call __aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -968,7 +994,18 @@ L042ctr32_four:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-L038ctr32_ret:
+L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -991,12 +1028,12 @@ L_aesni_xts_encrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L044enc1_loop_8:
+L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L044enc1_loop_8
+ jnz L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1020,14 +1057,14 @@ L044enc1_loop_8:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc L045xts_enc_short
+ jc L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L046xts_enc_loop6
+ jmp L048xts_enc_loop6
.align 4,0x90
-L046xts_enc_loop6:
+L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1116,23 +1153,23 @@ L046xts_enc_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L046xts_enc_loop6
+ jnc L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L045xts_enc_short:
+L047xts_enc_short:
addl $96,%eax
- jz L047xts_enc_done6x
+ jz L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L048xts_enc_one
+ jb L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L049xts_enc_two
+ je L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1141,7 +1178,7 @@ L045xts_enc_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L050xts_enc_three
+ jb L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1151,7 +1188,7 @@ L045xts_enc_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L051xts_enc_four
+ je L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1183,9 +1220,9 @@ L045xts_enc_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L048xts_enc_one:
+L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1193,20 +1230,20 @@ L048xts_enc_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L053enc1_loop_9:
+L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L053enc1_loop_9
+ jnz L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L049xts_enc_two:
+L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1220,9 +1257,9 @@ L049xts_enc_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L050xts_enc_three:
+L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1240,9 +1277,9 @@ L050xts_enc_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L051xts_enc_four:
+L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1264,28 +1301,28 @@ L051xts_enc_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L052xts_enc_done
+ jmp L054xts_enc_done
.align 4,0x90
-L047xts_enc_done6x:
+L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp L055xts_enc_steal
+ jmp L057xts_enc_steal
.align 4,0x90
-L052xts_enc_done:
+L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L054xts_enc_ret
+ jz L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-L055xts_enc_steal:
+L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1293,7 +1330,7 @@ L055xts_enc_steal:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L055xts_enc_steal
+ jnz L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1303,16 +1340,30 @@ L055xts_enc_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L056enc1_loop_10:
+L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L056enc1_loop_10
+ jnz L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-L054xts_enc_ret:
+L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L057enc1_loop_11:
+L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L057enc1_loop_11
+ jnz L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1369,14 +1420,14 @@ L057enc1_loop_11:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc L058xts_dec_short
+ jc L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp L059xts_dec_loop6
+ jmp L061xts_dec_loop6
.align 4,0x90
-L059xts_dec_loop6:
+L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1465,23 +1516,23 @@ L059xts_dec_loop6:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc L059xts_dec_loop6
+ jnc L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-L058xts_dec_short:
+L060xts_dec_short:
addl $96,%eax
- jz L060xts_dec_done6x
+ jz L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb L061xts_dec_one
+ jb L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je L062xts_dec_two
+ je L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1490,7 +1541,7 @@ L058xts_dec_short:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb L063xts_dec_three
+ jb L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1500,7 +1551,7 @@ L058xts_dec_short:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je L064xts_dec_four
+ je L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1532,9 +1583,9 @@ L058xts_dec_short:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L061xts_dec_one:
+L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1542,20 +1593,20 @@ L061xts_dec_one:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L066dec1_loop_12:
+L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L066dec1_loop_12
+ jnz L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L062xts_dec_two:
+L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1569,9 +1620,9 @@ L062xts_dec_two:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L063xts_dec_three:
+L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1589,9 +1640,9 @@ L063xts_dec_three:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L064xts_dec_four:
+L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1613,20 +1664,20 @@ L064xts_dec_four:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp L065xts_dec_done
+ jmp L067xts_dec_done
.align 4,0x90
-L060xts_dec_done6x:
+L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
movl %eax,112(%esp)
- jmp L068xts_dec_only_one_more
+ jmp L070xts_dec_only_one_more
.align 4,0x90
-L065xts_dec_done:
+L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz L067xts_dec_ret
+ jz L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1636,7 +1687,7 @@ L065xts_dec_done:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-L068xts_dec_only_one_more:
+L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L069dec1_loop_13:
+L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L069dec1_loop_13
+ jnz L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-L070xts_dec_steal:
+L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1667,7 +1718,7 @@ L070xts_dec_steal:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz L070xts_dec_steal
+ jnz L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1677,16 +1728,30 @@ L070xts_dec_steal:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L071dec1_loop_14:
+L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L071dec1_loop_14
+ jnz L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-L067xts_dec_ret:
+L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz L072cbc_abort
+ jz L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je L073cbc_decrypt
+ je L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb L074cbc_enc_tail
+ jb L076cbc_enc_tail
subl $16,%eax
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L075cbc_enc_loop:
+L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1733,24 +1798,25 @@ L075cbc_enc_loop:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-L076enc1_loop_15:
+L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L076enc1_loop_15
+ jnz L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc L075cbc_enc_loop
+ jnc L077cbc_enc_loop
addl $16,%eax
- jnz L074cbc_enc_tail
+ jnz L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp L077cbc_ret
-L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp L079cbc_ret
+L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1761,20 +1827,20 @@ L074cbc_enc_tail:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp L075cbc_enc_loop
+ jmp L077cbc_enc_loop
.align 4,0x90
-L073cbc_decrypt:
+L075cbc_decrypt:
cmpl $80,%eax
- jbe L078cbc_dec_tail
+ jbe L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp L079cbc_dec_loop6_enter
+ jmp L081cbc_dec_loop6_enter
.align 4,0x90
-L080cbc_dec_loop6:
+L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-L079cbc_dec_loop6_enter:
+L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1804,28 +1870,28 @@ L079cbc_dec_loop6_enter:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja L080cbc_dec_loop6
+ ja L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle L081cbc_dec_tail_collected
+ jle L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-L078cbc_dec_tail:
+L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe L082cbc_dec_one
+ jbe L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe L083cbc_dec_two
+ jbe L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe L084cbc_dec_three
+ jbe L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe L085cbc_dec_four
+ jbe L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1843,55 +1909,62 @@ L078cbc_dec_tail:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L082cbc_dec_one:
+L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-L086dec1_loop_16:
+L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz L086dec1_loop_16
+ jnz L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L083cbc_dec_two:
+L085cbc_dec_two:
call __aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L084cbc_dec_three:
+L086cbc_dec_three:
call __aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp L081cbc_dec_tail_collected
+ jmp L088cbc_dec_tail_collected
.align 4,0x90
-L085cbc_dec_four:
+L087cbc_dec_four:
call __aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1901,28 +1974,44 @@ L085cbc_dec_four:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-L081cbc_dec_tail_collected:
+ jmp L088cbc_dec_tail_collected
+.align 4,0x90
+L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L088cbc_dec_tail_collected:
andl $15,%eax
- jnz L087cbc_dec_tail_partial
+ jnz L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp L079cbc_ret
.align 4,0x90
-L087cbc_dec_tail_partial:
+L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-L077cbc_ret:
+ movdqa %xmm2,(%esp)
+L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-L072cbc_abort:
+ pxor %xmm7,%xmm7
+L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1930,52 +2019,62 @@ L072cbc_abort:
ret
.align 4
__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz L088bad_pointer
+ jz L091bad_pointer
testl %edx,%edx
- jz L088bad_pointer
+ jz L091bad_pointer
+ call L092pic
+L092pic:
+ popl %ebx
+ leal Lkey_const-L092pic(%ebx),%ebx
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je L08914rounds
+ je L09314rounds
cmpl $192,%ecx
- je L09012rounds
+ je L09412rounds
cmpl $128,%ecx
- jne L091bad_keybits
+ jne L095bad_keybits
.align 4,0x90
-L09210rounds:
+L09610rounds:
+ cmpl $268435456,%ebp
+ je L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call L093key_128_cold
+ call L098key_128_cold
.byte 102,15,58,223,200,2
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,4
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,8
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,16
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,32
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,64
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,128
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,27
- call L094key_128
+ call L099key_128
.byte 102,15,58,223,200,54
- call L094key_128
+ call L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L094key_128:
+L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-L093key_128_cold:
+L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -1984,38 +2083,91 @@ L093key_128_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L09012rounds:
+L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call L095key_192a_cold
+ call L103key_192a_cold
.byte 102,15,58,223,202,2
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,4
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,8
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,16
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,32
- call L096key_192b
+ call L104key_192b
.byte 102,15,58,223,202,64
- call L097key_192a
+ call L105key_192a
.byte 102,15,58,223,202,128
- call L096key_192b
+ call L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L097key_192a:
+L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 4,0x90
-L095key_192a_cold:
+L103key_192a_cold:
movaps %xmm2,%xmm5
-L098key_192b_warm:
+L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2029,56 +2181,90 @@ L098key_192b_warm:
pxor %xmm3,%xmm2
ret
.align 4,0x90
-L096key_192b:
+L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp L098key_192b_warm
+ jmp L106key_192b_warm
.align 4,0x90
-L08914rounds:
+L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L100good_key
+.align 4,0x90
+L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call L099key_256a_cold
+ call L109key_256a_cold
.byte 102,15,58,223,200,1
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,2
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,2
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,4
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,4
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,8
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,8
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,16
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,16
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,32
- call L101key_256a
+ call L111key_256a
.byte 102,15,58,223,200,32
- call L100key_256b
+ call L110key_256b
.byte 102,15,58,223,202,64
- call L101key_256a
+ call L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp L100good_key
.align 4,0x90
-L101key_256a:
+L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-L099key_256a_cold:
+L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2087,7 +2273,7 @@ L099key_256a_cold:
xorps %xmm1,%xmm0
ret
.align 4,0x90
-L100key_256b:
+L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2097,13 +2283,70 @@ L100key_256b:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 4,0x90
+L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L112loop_key256
+L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 2,0x90
-L088bad_pointer:
+L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 2,0x90
-L091bad_keybits:
+L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.globl _aesni_set_encrypt_key
.align 4
@@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz L102dec_key_ret
+ jnz L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-L103dec_key_inverse:
+L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2143,14 +2386,27 @@ L103dec_key_inverse:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja L103dec_key_inverse
+ ja L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-L102dec_key_ret:
+L114dec_key_ret:
ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+.comm _OPENSSL_ia32cap_P,16,2
diff --git a/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm b/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm
index 43fdb5a034..6511c21bcf 100644
--- a/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm
+++ b/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm
@@ -17,6 +17,7 @@ IF @Version LT 800
ELSE
.text$ SEGMENT ALIGN(64) 'CODE'
ENDIF
+;EXTERN _OPENSSL_ia32cap_P:NEAR
ALIGN 16
_aesni_encrypt PROC PUBLIC
$L_aesni_encrypt_begin::
@@ -36,7 +37,10 @@ DB 102,15,56,220,209
lea edx,DWORD PTR 16[edx]
jnz $L000enc1_loop_1
DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR [eax],xmm2
+ pxor xmm2,xmm2
ret
_aesni_encrypt ENDP
ALIGN 16
@@ -58,7 +62,10 @@ DB 102,15,56,222,209
lea edx,DWORD PTR 16[edx]
jnz $L001dec1_loop_2
DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
movups XMMWORD PTR [eax],xmm2
+ pxor xmm2,xmm2
ret
_aesni_decrypt ENDP
ALIGN 16
@@ -265,17 +272,15 @@ DB 102,15,56,220,217
neg ecx
DB 102,15,56,220,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR [ecx*1+edx]
add ecx,16
-DB 102,15,56,220,233
-DB 102,15,56,220,241
-DB 102,15,56,220,249
- movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jmp $L_aesni_encrypt6_enter
+ jmp $L008_aesni_encrypt6_inner
ALIGN 16
-$L008enc6_loop:
+$L009enc6_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
+$L008_aesni_encrypt6_inner:
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
@@ -289,7 +294,7 @@ DB 102,15,56,220,232
DB 102,15,56,220,240
DB 102,15,56,220,248
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L008enc6_loop
+ jnz $L009enc6_loop
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
@@ -320,17 +325,15 @@ DB 102,15,56,222,217
neg ecx
DB 102,15,56,222,225
pxor xmm7,xmm0
+ movups xmm0,XMMWORD PTR [ecx*1+edx]
add ecx,16
-DB 102,15,56,222,233
-DB 102,15,56,222,241
-DB 102,15,56,222,249
- movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jmp $L_aesni_decrypt6_enter
+ jmp $L010_aesni_decrypt6_inner
ALIGN 16
-$L009dec6_loop:
+$L011dec6_loop:
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
+$L010_aesni_decrypt6_inner:
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
@@ -344,7 +347,7 @@ DB 102,15,56,222,232
DB 102,15,56,222,240
DB 102,15,56,222,248
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L009dec6_loop
+ jnz $L011dec6_loop
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
@@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin::
mov edx,DWORD PTR 32[esp]
mov ebx,DWORD PTR 36[esp]
and eax,-16
- jz $L010ecb_ret
+ jz $L012ecb_ret
mov ecx,DWORD PTR 240[edx]
test ebx,ebx
- jz $L011ecb_decrypt
+ jz $L013ecb_decrypt
mov ebp,edx
mov ebx,ecx
cmp eax,96
- jb $L012ecb_enc_tail
+ jb $L014ecb_enc_tail
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin::
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
sub eax,96
- jmp $L013ecb_enc_loop6_enter
+ jmp $L015ecb_enc_loop6_enter
ALIGN 16
-$L014ecb_enc_loop6:
+$L016ecb_enc_loop6:
movups XMMWORD PTR [edi],xmm2
movdqu xmm2,XMMWORD PTR [esi]
movups XMMWORD PTR 16[edi],xmm3
@@ -405,12 +408,12 @@ $L014ecb_enc_loop6:
lea edi,DWORD PTR 96[edi]
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
-$L013ecb_enc_loop6_enter:
+$L015ecb_enc_loop6_enter:
call __aesni_encrypt6
mov edx,ebp
mov ecx,ebx
sub eax,96
- jnc $L014ecb_enc_loop6
+ jnc $L016ecb_enc_loop6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
@@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter:
movups XMMWORD PTR 80[edi],xmm7
lea edi,DWORD PTR 96[edi]
add eax,96
- jz $L010ecb_ret
-$L012ecb_enc_tail:
+ jz $L012ecb_ret
+$L014ecb_enc_tail:
movups xmm2,XMMWORD PTR [esi]
cmp eax,32
- jb $L015ecb_enc_one
+ jb $L017ecb_enc_one
movups xmm3,XMMWORD PTR 16[esi]
- je $L016ecb_enc_two
+ je $L018ecb_enc_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,64
- jb $L017ecb_enc_three
+ jb $L019ecb_enc_three
movups xmm5,XMMWORD PTR 48[esi]
- je $L018ecb_enc_four
+ je $L020ecb_enc_four
movups xmm6,XMMWORD PTR 64[esi]
xorps xmm7,xmm7
call __aesni_encrypt6
@@ -439,49 +442,49 @@ $L012ecb_enc_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L015ecb_enc_one:
+$L017ecb_enc_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L019enc1_loop_3:
+$L021enc1_loop_3:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L019enc1_loop_3
+ jnz $L021enc1_loop_3
DB 102,15,56,221,209
movups XMMWORD PTR [edi],xmm2
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L016ecb_enc_two:
+$L018ecb_enc_two:
call __aesni_encrypt2
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L017ecb_enc_three:
+$L019ecb_enc_three:
call __aesni_encrypt3
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L018ecb_enc_four:
+$L020ecb_enc_four:
call __aesni_encrypt4
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L011ecb_decrypt:
+$L013ecb_decrypt:
mov ebp,edx
mov ebx,ecx
cmp eax,96
- jb $L020ecb_dec_tail
+ jb $L022ecb_dec_tail
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -490,9 +493,9 @@ $L011ecb_decrypt:
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
sub eax,96
- jmp $L021ecb_dec_loop6_enter
+ jmp $L023ecb_dec_loop6_enter
ALIGN 16
-$L022ecb_dec_loop6:
+$L024ecb_dec_loop6:
movups XMMWORD PTR [edi],xmm2
movdqu xmm2,XMMWORD PTR [esi]
movups XMMWORD PTR 16[edi],xmm3
@@ -507,12 +510,12 @@ $L022ecb_dec_loop6:
lea edi,DWORD PTR 96[edi]
movdqu xmm7,XMMWORD PTR 80[esi]
lea esi,DWORD PTR 96[esi]
-$L021ecb_dec_loop6_enter:
+$L023ecb_dec_loop6_enter:
call __aesni_decrypt6
mov edx,ebp
mov ecx,ebx
sub eax,96
- jnc $L022ecb_dec_loop6
+ jnc $L024ecb_dec_loop6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
@@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter:
movups XMMWORD PTR 80[edi],xmm7
lea edi,DWORD PTR 96[edi]
add eax,96
- jz $L010ecb_ret
-$L020ecb_dec_tail:
+ jz $L012ecb_ret
+$L022ecb_dec_tail:
movups xmm2,XMMWORD PTR [esi]
cmp eax,32
- jb $L023ecb_dec_one
+ jb $L025ecb_dec_one
movups xmm3,XMMWORD PTR 16[esi]
- je $L024ecb_dec_two
+ je $L026ecb_dec_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,64
- jb $L025ecb_dec_three
+ jb $L027ecb_dec_three
movups xmm5,XMMWORD PTR 48[esi]
- je $L026ecb_dec_four
+ je $L028ecb_dec_four
movups xmm6,XMMWORD PTR 64[esi]
xorps xmm7,xmm7
call __aesni_decrypt6
@@ -541,43 +544,51 @@ $L020ecb_dec_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L023ecb_dec_one:
+$L025ecb_dec_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L027dec1_loop_4:
+$L029dec1_loop_4:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L027dec1_loop_4
+ jnz $L029dec1_loop_4
DB 102,15,56,223,209
movups XMMWORD PTR [edi],xmm2
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L024ecb_dec_two:
+$L026ecb_dec_two:
call __aesni_decrypt2
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L025ecb_dec_three:
+$L027ecb_dec_three:
call __aesni_decrypt3
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L010ecb_ret
+ jmp $L012ecb_ret
ALIGN 16
-$L026ecb_dec_four:
+$L028ecb_dec_four:
call __aesni_decrypt4
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
-$L010ecb_ret:
+$L012ecb_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin::
lea edx,DWORD PTR 32[ecx*1+edx]
sub ebx,ecx
DB 102,15,56,0,253
-$L028ccm64_enc_outer:
+$L030ccm64_enc_outer:
movups xmm0,XMMWORD PTR [ebp]
mov ecx,ebx
movups xmm6,XMMWORD PTR [esi]
@@ -631,7 +642,7 @@ $L028ccm64_enc_outer:
xorps xmm0,xmm6
xorps xmm3,xmm0
movups xmm0,XMMWORD PTR 32[ebp]
-$L029ccm64_enc2_loop:
+$L031ccm64_enc2_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
movups xmm1,XMMWORD PTR [ecx*1+edx]
@@ -639,7 +650,7 @@ DB 102,15,56,220,217
DB 102,15,56,220,208
DB 102,15,56,220,216
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L029ccm64_enc2_loop
+ jnz $L031ccm64_enc2_loop
DB 102,15,56,220,209
DB 102,15,56,220,217
paddq xmm7,XMMWORD PTR 16[esp]
@@ -652,10 +663,18 @@ DB 102,15,56,221,216
movups XMMWORD PTR [edi],xmm6
DB 102,15,56,0,213
lea edi,DWORD PTR 16[edi]
- jnz $L028ccm64_enc_outer
+ jnz $L030ccm64_enc_outer
mov esp,DWORD PTR 48[esp]
mov edi,DWORD PTR 40[esp]
movups XMMWORD PTR [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -701,12 +720,12 @@ DB 102,15,56,0,253
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L030enc1_loop_5:
+$L032enc1_loop_5:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L030enc1_loop_5
+ jnz $L032enc1_loop_5
DB 102,15,56,221,209
shl ebx,4
mov ecx,16
@@ -716,16 +735,16 @@ DB 102,15,56,221,209
sub ecx,ebx
lea edx,DWORD PTR 32[ebx*1+ebp]
mov ebx,ecx
- jmp $L031ccm64_dec_outer
+ jmp $L033ccm64_dec_outer
ALIGN 16
-$L031ccm64_dec_outer:
+$L033ccm64_dec_outer:
xorps xmm6,xmm2
movdqa xmm2,xmm7
movups XMMWORD PTR [edi],xmm6
lea edi,DWORD PTR 16[edi]
DB 102,15,56,0,213
sub eax,1
- jz $L032ccm64_dec_break
+ jz $L034ccm64_dec_break
movups xmm0,XMMWORD PTR [ebp]
mov ecx,ebx
movups xmm1,XMMWORD PTR 16[ebp]
@@ -733,7 +752,7 @@ DB 102,15,56,0,213
xorps xmm2,xmm0
xorps xmm3,xmm6
movups xmm0,XMMWORD PTR 32[ebp]
-$L033ccm64_dec2_loop:
+$L035ccm64_dec2_loop:
DB 102,15,56,220,209
DB 102,15,56,220,217
movups xmm1,XMMWORD PTR [ecx*1+edx]
@@ -741,7 +760,7 @@ DB 102,15,56,220,217
DB 102,15,56,220,208
DB 102,15,56,220,216
movups xmm0,XMMWORD PTR [ecx*1+edx-16]
- jnz $L033ccm64_dec2_loop
+ jnz $L035ccm64_dec2_loop
movups xmm6,XMMWORD PTR [esi]
paddq xmm7,XMMWORD PTR 16[esp]
DB 102,15,56,220,209
@@ -749,9 +768,9 @@ DB 102,15,56,220,217
DB 102,15,56,221,208
DB 102,15,56,221,216
lea esi,QWORD PTR 16[esi]
- jmp $L031ccm64_dec_outer
+ jmp $L033ccm64_dec_outer
ALIGN 16
-$L032ccm64_dec_break:
+$L034ccm64_dec_break:
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
movups xmm0,XMMWORD PTR [edx]
@@ -759,16 +778,24 @@ $L032ccm64_dec_break:
xorps xmm6,xmm0
lea edx,DWORD PTR 32[edx]
xorps xmm3,xmm6
-$L034enc1_loop_6:
+$L036enc1_loop_6:
DB 102,15,56,220,217
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L034enc1_loop_6
+ jnz $L036enc1_loop_6
DB 102,15,56,221,217
mov esp,DWORD PTR 48[esp]
mov edi,DWORD PTR 40[esp]
movups XMMWORD PTR [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
pop edi
pop esi
pop ebx
@@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin::
and esp,-16
mov DWORD PTR 80[esp],ebp
cmp eax,1
- je $L035ctr32_one_shortcut
+ je $L037ctr32_one_shortcut
movdqu xmm7,XMMWORD PTR [ebx]
mov DWORD PTR [esp],202182159
mov DWORD PTR 4[esp],134810123
@@ -830,7 +857,7 @@ DB 102,15,56,0,202
pshufd xmm2,xmm0,192
pshufd xmm3,xmm0,128
cmp eax,6
- jb $L036ctr32_tail
+ jb $L038ctr32_tail
pxor xmm7,xmm6
shl ecx,4
mov ebx,16
@@ -839,9 +866,9 @@ DB 102,15,56,0,202
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
sub eax,6
- jmp $L037ctr32_loop6
+ jmp $L039ctr32_loop6
ALIGN 16
-$L037ctr32_loop6:
+$L039ctr32_loop6:
pshufd xmm4,xmm0,64
movdqa xmm0,XMMWORD PTR 32[esp]
pshufd xmm5,xmm1,192
@@ -895,27 +922,27 @@ DB 102,15,56,0,202
lea edi,DWORD PTR 96[edi]
pshufd xmm3,xmm0,128
sub eax,6
- jnc $L037ctr32_loop6
+ jnc $L039ctr32_loop6
add eax,6
- jz $L038ctr32_ret
+ jz $L040ctr32_ret
movdqu xmm7,XMMWORD PTR [ebp]
mov edx,ebp
pxor xmm7,XMMWORD PTR 32[esp]
mov ecx,DWORD PTR 240[ebp]
-$L036ctr32_tail:
+$L038ctr32_tail:
por xmm2,xmm7
cmp eax,2
- jb $L039ctr32_one
+ jb $L041ctr32_one
pshufd xmm4,xmm0,64
por xmm3,xmm7
- je $L040ctr32_two
+ je $L042ctr32_two
pshufd xmm5,xmm1,192
por xmm4,xmm7
cmp eax,4
- jb $L041ctr32_three
+ jb $L043ctr32_three
pshufd xmm6,xmm1,128
por xmm5,xmm7
- je $L042ctr32_four
+ je $L044ctr32_four
por xmm6,xmm7
call __aesni_encrypt6
movups xmm1,XMMWORD PTR [esi]
@@ -933,29 +960,29 @@ $L036ctr32_tail:
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L035ctr32_one_shortcut:
+$L037ctr32_one_shortcut:
movups xmm2,XMMWORD PTR [ebx]
mov ecx,DWORD PTR 240[edx]
-$L039ctr32_one:
+$L041ctr32_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L043enc1_loop_7:
+$L045enc1_loop_7:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L043enc1_loop_7
+ jnz $L045enc1_loop_7
DB 102,15,56,221,209
movups xmm6,XMMWORD PTR [esi]
xorps xmm6,xmm2
movups XMMWORD PTR [edi],xmm6
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L040ctr32_two:
+$L042ctr32_two:
call __aesni_encrypt2
movups xmm5,XMMWORD PTR [esi]
movups xmm6,XMMWORD PTR 16[esi]
@@ -963,9 +990,9 @@ $L040ctr32_two:
xorps xmm3,xmm6
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L041ctr32_three:
+$L043ctr32_three:
call __aesni_encrypt3
movups xmm5,XMMWORD PTR [esi]
movups xmm6,XMMWORD PTR 16[esi]
@@ -976,9 +1003,9 @@ $L041ctr32_three:
xorps xmm4,xmm7
movups XMMWORD PTR 16[edi],xmm3
movups XMMWORD PTR 32[edi],xmm4
- jmp $L038ctr32_ret
+ jmp $L040ctr32_ret
ALIGN 16
-$L042ctr32_four:
+$L044ctr32_four:
call __aesni_encrypt4
movups xmm6,XMMWORD PTR [esi]
movups xmm7,XMMWORD PTR 16[esi]
@@ -992,7 +1019,18 @@ $L042ctr32_four:
xorps xmm5,xmm0
movups XMMWORD PTR 32[edi],xmm4
movups XMMWORD PTR 48[edi],xmm5
-$L038ctr32_ret:
+$L040ctr32_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
mov esp,DWORD PTR 80[esp]
pop edi
pop esi
@@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin::
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L044enc1_loop_8:
+$L046enc1_loop_8:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L044enc1_loop_8
+ jnz $L046enc1_loop_8
DB 102,15,56,221,209
mov esi,DWORD PTR 20[esp]
mov edi,DWORD PTR 24[esp]
@@ -1044,14 +1082,14 @@ DB 102,15,56,221,209
mov ebp,edx
mov ebx,ecx
sub eax,96
- jc $L045xts_enc_short
+ jc $L047xts_enc_short
shl ecx,4
mov ebx,16
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
- jmp $L046xts_enc_loop6
+ jmp $L048xts_enc_loop6
ALIGN 16
-$L046xts_enc_loop6:
+$L048xts_enc_loop6:
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa XMMWORD PTR [esp],xmm1
@@ -1140,23 +1178,23 @@ DB 102,15,56,220,249
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
sub eax,96
- jnc $L046xts_enc_loop6
+ jnc $L048xts_enc_loop6
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
mov ebx,ecx
-$L045xts_enc_short:
+$L047xts_enc_short:
add eax,96
- jz $L047xts_enc_done6x
+ jz $L049xts_enc_done6x
movdqa xmm5,xmm1
cmp eax,32
- jb $L048xts_enc_one
+ jb $L050xts_enc_one
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
paddq xmm1,xmm1
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
- je $L049xts_enc_two
+ je $L051xts_enc_two
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm6,xmm1
@@ -1165,7 +1203,7 @@ $L045xts_enc_short:
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
cmp eax,64
- jb $L050xts_enc_three
+ jb $L052xts_enc_three
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm7,xmm1
@@ -1175,7 +1213,7 @@ $L045xts_enc_short:
pxor xmm1,xmm2
movdqa XMMWORD PTR [esp],xmm5
movdqa XMMWORD PTR 16[esp],xmm6
- je $L051xts_enc_four
+ je $L053xts_enc_four
movdqa XMMWORD PTR 32[esp],xmm7
pshufd xmm7,xmm0,19
movdqa XMMWORD PTR 48[esp],xmm1
@@ -1207,9 +1245,9 @@ $L045xts_enc_short:
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L048xts_enc_one:
+$L050xts_enc_one:
movups xmm2,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
xorps xmm2,xmm5
@@ -1217,20 +1255,20 @@ $L048xts_enc_one:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L053enc1_loop_9:
+$L055enc1_loop_9:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L053enc1_loop_9
+ jnz $L055enc1_loop_9
DB 102,15,56,221,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
movdqa xmm1,xmm5
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L049xts_enc_two:
+$L051xts_enc_two:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1244,9 +1282,9 @@ $L049xts_enc_two:
movups XMMWORD PTR 16[edi],xmm3
lea edi,DWORD PTR 32[edi]
movdqa xmm1,xmm6
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L050xts_enc_three:
+$L052xts_enc_three:
movaps xmm7,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1264,9 +1302,9 @@ $L050xts_enc_three:
movups XMMWORD PTR 32[edi],xmm4
lea edi,DWORD PTR 48[edi]
movdqa xmm1,xmm7
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L051xts_enc_four:
+$L053xts_enc_four:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1288,28 +1326,28 @@ $L051xts_enc_four:
movups XMMWORD PTR 48[edi],xmm5
lea edi,DWORD PTR 64[edi]
movdqa xmm1,xmm6
- jmp $L052xts_enc_done
+ jmp $L054xts_enc_done
ALIGN 16
-$L047xts_enc_done6x:
+$L049xts_enc_done6x:
mov eax,DWORD PTR 112[esp]
and eax,15
- jz $L054xts_enc_ret
+ jz $L056xts_enc_ret
movdqa xmm5,xmm1
mov DWORD PTR 112[esp],eax
- jmp $L055xts_enc_steal
+ jmp $L057xts_enc_steal
ALIGN 16
-$L052xts_enc_done:
+$L054xts_enc_done:
mov eax,DWORD PTR 112[esp]
pxor xmm0,xmm0
and eax,15
- jz $L054xts_enc_ret
+ jz $L056xts_enc_ret
pcmpgtd xmm0,xmm1
mov DWORD PTR 112[esp],eax
pshufd xmm5,xmm0,19
paddq xmm1,xmm1
pand xmm5,XMMWORD PTR 96[esp]
pxor xmm5,xmm1
-$L055xts_enc_steal:
+$L057xts_enc_steal:
movzx ecx,BYTE PTR [esi]
movzx edx,BYTE PTR [edi-16]
lea esi,DWORD PTR 1[esi]
@@ -1317,7 +1355,7 @@ $L055xts_enc_steal:
mov BYTE PTR [edi],dl
lea edi,DWORD PTR 1[edi]
sub eax,1
- jnz $L055xts_enc_steal
+ jnz $L057xts_enc_steal
sub edi,DWORD PTR 112[esp]
mov edx,ebp
mov ecx,ebx
@@ -1327,16 +1365,30 @@ $L055xts_enc_steal:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L056enc1_loop_10:
+$L058enc1_loop_10:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L056enc1_loop_10
+ jnz $L058enc1_loop_10
DB 102,15,56,221,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi-16],xmm2
-$L054xts_enc_ret:
+$L056xts_enc_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa XMMWORD PTR [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa XMMWORD PTR 16[esp],xmm0
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
+ movdqa XMMWORD PTR 80[esp],xmm0
mov esp,DWORD PTR 116[esp]
pop edi
pop esi
@@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin::
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L057enc1_loop_11:
+$L059enc1_loop_11:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L057enc1_loop_11
+ jnz $L059enc1_loop_11
DB 102,15,56,221,209
mov esi,DWORD PTR 20[esp]
mov edi,DWORD PTR 24[esp]
@@ -1393,14 +1445,14 @@ DB 102,15,56,221,209
pcmpgtd xmm0,xmm1
and eax,-16
sub eax,96
- jc $L058xts_dec_short
+ jc $L060xts_dec_short
shl ecx,4
mov ebx,16
sub ebx,ecx
lea edx,DWORD PTR 32[ecx*1+edx]
- jmp $L059xts_dec_loop6
+ jmp $L061xts_dec_loop6
ALIGN 16
-$L059xts_dec_loop6:
+$L061xts_dec_loop6:
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa XMMWORD PTR [esp],xmm1
@@ -1489,23 +1541,23 @@ DB 102,15,56,222,249
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
sub eax,96
- jnc $L059xts_dec_loop6
+ jnc $L061xts_dec_loop6
mov ecx,DWORD PTR 240[ebp]
mov edx,ebp
mov ebx,ecx
-$L058xts_dec_short:
+$L060xts_dec_short:
add eax,96
- jz $L060xts_dec_done6x
+ jz $L062xts_dec_done6x
movdqa xmm5,xmm1
cmp eax,32
- jb $L061xts_dec_one
+ jb $L063xts_dec_one
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
paddq xmm1,xmm1
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
- je $L062xts_dec_two
+ je $L064xts_dec_two
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm6,xmm1
@@ -1514,7 +1566,7 @@ $L058xts_dec_short:
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
cmp eax,64
- jb $L063xts_dec_three
+ jb $L065xts_dec_three
pshufd xmm2,xmm0,19
pxor xmm0,xmm0
movdqa xmm7,xmm1
@@ -1524,7 +1576,7 @@ $L058xts_dec_short:
pxor xmm1,xmm2
movdqa XMMWORD PTR [esp],xmm5
movdqa XMMWORD PTR 16[esp],xmm6
- je $L064xts_dec_four
+ je $L066xts_dec_four
movdqa XMMWORD PTR 32[esp],xmm7
pshufd xmm7,xmm0,19
movdqa XMMWORD PTR 48[esp],xmm1
@@ -1556,9 +1608,9 @@ $L058xts_dec_short:
movups XMMWORD PTR 48[edi],xmm5
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L061xts_dec_one:
+$L063xts_dec_one:
movups xmm2,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
xorps xmm2,xmm5
@@ -1566,20 +1618,20 @@ $L061xts_dec_one:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L066dec1_loop_12:
+$L068dec1_loop_12:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L066dec1_loop_12
+ jnz $L068dec1_loop_12
DB 102,15,56,223,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
movdqa xmm1,xmm5
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L062xts_dec_two:
+$L064xts_dec_two:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1593,9 +1645,9 @@ $L062xts_dec_two:
movups XMMWORD PTR 16[edi],xmm3
lea edi,DWORD PTR 32[edi]
movdqa xmm1,xmm6
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L063xts_dec_three:
+$L065xts_dec_three:
movaps xmm7,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1613,9 +1665,9 @@ $L063xts_dec_three:
movups XMMWORD PTR 32[edi],xmm4
lea edi,DWORD PTR 48[edi]
movdqa xmm1,xmm7
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L064xts_dec_four:
+$L066xts_dec_four:
movaps xmm6,xmm1
movups xmm2,XMMWORD PTR [esi]
movups xmm3,XMMWORD PTR 16[esi]
@@ -1637,20 +1689,20 @@ $L064xts_dec_four:
movups XMMWORD PTR 48[edi],xmm5
lea edi,DWORD PTR 64[edi]
movdqa xmm1,xmm6
- jmp $L065xts_dec_done
+ jmp $L067xts_dec_done
ALIGN 16
-$L060xts_dec_done6x:
+$L062xts_dec_done6x:
mov eax,DWORD PTR 112[esp]
and eax,15
- jz $L067xts_dec_ret
+ jz $L069xts_dec_ret
mov DWORD PTR 112[esp],eax
- jmp $L068xts_dec_only_one_more
+ jmp $L070xts_dec_only_one_more
ALIGN 16
-$L065xts_dec_done:
+$L067xts_dec_done:
mov eax,DWORD PTR 112[esp]
pxor xmm0,xmm0
and eax,15
- jz $L067xts_dec_ret
+ jz $L069xts_dec_ret
pcmpgtd xmm0,xmm1
mov DWORD PTR 112[esp],eax
pshufd xmm2,xmm0,19
@@ -1660,7 +1712,7 @@ $L065xts_dec_done:
pand xmm2,xmm3
pcmpgtd xmm0,xmm1
pxor xmm1,xmm2
-$L068xts_dec_only_one_more:
+$L070xts_dec_only_one_more:
pshufd xmm5,xmm0,19
movdqa xmm6,xmm1
paddq xmm1,xmm1
@@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L069dec1_loop_13:
+$L071dec1_loop_13:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L069dec1_loop_13
+ jnz $L071dec1_loop_13
DB 102,15,56,223,209
xorps xmm2,xmm5
movups XMMWORD PTR [edi],xmm2
-$L070xts_dec_steal:
+$L072xts_dec_steal:
movzx ecx,BYTE PTR 16[esi]
movzx edx,BYTE PTR [edi]
lea esi,DWORD PTR 1[esi]
@@ -1691,7 +1743,7 @@ $L070xts_dec_steal:
mov BYTE PTR 16[edi],dl
lea edi,DWORD PTR 1[edi]
sub eax,1
- jnz $L070xts_dec_steal
+ jnz $L072xts_dec_steal
sub edi,DWORD PTR 112[esp]
mov edx,ebp
mov ecx,ebx
@@ -1701,16 +1753,30 @@ $L070xts_dec_steal:
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L071dec1_loop_14:
+$L073dec1_loop_14:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L071dec1_loop_14
+ jnz $L073dec1_loop_14
DB 102,15,56,223,209
xorps xmm2,xmm6
movups XMMWORD PTR [edi],xmm2
-$L067xts_dec_ret:
+$L069xts_dec_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa XMMWORD PTR [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa XMMWORD PTR 16[esp],xmm0
+ pxor xmm4,xmm4
+ movdqa XMMWORD PTR 32[esp],xmm0
+ pxor xmm5,xmm5
+ movdqa XMMWORD PTR 48[esp],xmm0
+ pxor xmm6,xmm6
+ movdqa XMMWORD PTR 64[esp],xmm0
+ pxor xmm7,xmm7
+ movdqa XMMWORD PTR 80[esp],xmm0
mov esp,DWORD PTR 116[esp]
pop edi
pop esi
@@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin::
mov edx,DWORD PTR 32[esp]
mov ebp,DWORD PTR 36[esp]
test eax,eax
- jz $L072cbc_abort
+ jz $L074cbc_abort
cmp DWORD PTR 40[esp],0
xchg ebx,esp
movups xmm7,XMMWORD PTR [ebp]
@@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin::
mov ebp,edx
mov DWORD PTR 16[esp],ebx
mov ebx,ecx
- je $L073cbc_decrypt
+ je $L075cbc_decrypt
movaps xmm2,xmm7
cmp eax,16
- jb $L074cbc_enc_tail
+ jb $L076cbc_enc_tail
sub eax,16
- jmp $L075cbc_enc_loop
+ jmp $L077cbc_enc_loop
ALIGN 16
-$L075cbc_enc_loop:
+$L077cbc_enc_loop:
movups xmm7,XMMWORD PTR [esi]
lea esi,DWORD PTR 16[esi]
movups xmm0,XMMWORD PTR [edx]
@@ -1757,24 +1823,25 @@ $L075cbc_enc_loop:
xorps xmm7,xmm0
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm7
-$L076enc1_loop_15:
+$L078enc1_loop_15:
DB 102,15,56,220,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L076enc1_loop_15
+ jnz $L078enc1_loop_15
DB 102,15,56,221,209
mov ecx,ebx
mov edx,ebp
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
sub eax,16
- jnc $L075cbc_enc_loop
+ jnc $L077cbc_enc_loop
add eax,16
- jnz $L074cbc_enc_tail
+ jnz $L076cbc_enc_tail
movaps xmm7,xmm2
- jmp $L077cbc_ret
-$L074cbc_enc_tail:
+ pxor xmm2,xmm2
+ jmp $L079cbc_ret
+$L076cbc_enc_tail:
mov ecx,eax
DD 2767451785
mov ecx,16
@@ -1785,20 +1852,20 @@ DD 2868115081
mov ecx,ebx
mov esi,edi
mov edx,ebp
- jmp $L075cbc_enc_loop
+ jmp $L077cbc_enc_loop
ALIGN 16
-$L073cbc_decrypt:
+$L075cbc_decrypt:
cmp eax,80
- jbe $L078cbc_dec_tail
+ jbe $L080cbc_dec_tail
movaps XMMWORD PTR [esp],xmm7
sub eax,80
- jmp $L079cbc_dec_loop6_enter
+ jmp $L081cbc_dec_loop6_enter
ALIGN 16
-$L080cbc_dec_loop6:
+$L082cbc_dec_loop6:
movaps XMMWORD PTR [esp],xmm0
movups XMMWORD PTR [edi],xmm7
lea edi,DWORD PTR 16[edi]
-$L079cbc_dec_loop6_enter:
+$L081cbc_dec_loop6_enter:
movdqu xmm2,XMMWORD PTR [esi]
movdqu xmm3,XMMWORD PTR 16[esi]
movdqu xmm4,XMMWORD PTR 32[esi]
@@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter:
movups XMMWORD PTR 64[edi],xmm6
lea edi,DWORD PTR 80[edi]
sub eax,96
- ja $L080cbc_dec_loop6
+ ja $L082cbc_dec_loop6
movaps xmm2,xmm7
movaps xmm7,xmm0
add eax,80
- jle $L081cbc_dec_tail_collected
+ jle $L083cbc_dec_clear_tail_collected
movups XMMWORD PTR [edi],xmm2
lea edi,DWORD PTR 16[edi]
-$L078cbc_dec_tail:
+$L080cbc_dec_tail:
movups xmm2,XMMWORD PTR [esi]
movaps xmm6,xmm2
cmp eax,16
- jbe $L082cbc_dec_one
+ jbe $L084cbc_dec_one
movups xmm3,XMMWORD PTR 16[esi]
movaps xmm5,xmm3
cmp eax,32
- jbe $L083cbc_dec_two
+ jbe $L085cbc_dec_two
movups xmm4,XMMWORD PTR 32[esi]
cmp eax,48
- jbe $L084cbc_dec_three
+ jbe $L086cbc_dec_three
movups xmm5,XMMWORD PTR 48[esi]
cmp eax,64
- jbe $L085cbc_dec_four
+ jbe $L087cbc_dec_four
movups xmm6,XMMWORD PTR 64[esi]
movaps XMMWORD PTR [esp],xmm7
movups xmm2,XMMWORD PTR [esi]
@@ -1867,55 +1934,62 @@ $L078cbc_dec_tail:
xorps xmm6,xmm0
movups XMMWORD PTR [edi],xmm2
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
movups XMMWORD PTR 32[edi],xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR 48[edi],xmm5
+ pxor xmm5,xmm5
lea edi,DWORD PTR 64[edi]
movaps xmm2,xmm6
+ pxor xmm6,xmm6
sub eax,80
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L082cbc_dec_one:
+$L084cbc_dec_one:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR 16[edx]
lea edx,DWORD PTR 32[edx]
xorps xmm2,xmm0
-$L086dec1_loop_16:
+$L089dec1_loop_16:
DB 102,15,56,222,209
dec ecx
movups xmm1,XMMWORD PTR [edx]
lea edx,DWORD PTR 16[edx]
- jnz $L086dec1_loop_16
+ jnz $L089dec1_loop_16
DB 102,15,56,223,209
xorps xmm2,xmm7
movaps xmm7,xmm6
sub eax,16
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L083cbc_dec_two:
+$L085cbc_dec_two:
call __aesni_decrypt2
xorps xmm2,xmm7
xorps xmm3,xmm6
movups XMMWORD PTR [edi],xmm2
movaps xmm2,xmm3
+ pxor xmm3,xmm3
lea edi,DWORD PTR 16[edi]
movaps xmm7,xmm5
sub eax,32
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L084cbc_dec_three:
+$L086cbc_dec_three:
call __aesni_decrypt3
xorps xmm2,xmm7
xorps xmm3,xmm6
xorps xmm4,xmm5
movups XMMWORD PTR [edi],xmm2
movaps xmm2,xmm4
+ pxor xmm4,xmm4
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
lea edi,DWORD PTR 32[edi]
movups xmm7,XMMWORD PTR 32[esi]
sub eax,48
- jmp $L081cbc_dec_tail_collected
+ jmp $L088cbc_dec_tail_collected
ALIGN 16
-$L085cbc_dec_four:
+$L087cbc_dec_four:
call __aesni_decrypt4
movups xmm1,XMMWORD PTR 16[esi]
movups xmm0,XMMWORD PTR 32[esi]
@@ -1925,28 +1999,44 @@ $L085cbc_dec_four:
movups XMMWORD PTR [edi],xmm2
xorps xmm4,xmm1
movups XMMWORD PTR 16[edi],xmm3
+ pxor xmm3,xmm3
xorps xmm5,xmm0
movups XMMWORD PTR 32[edi],xmm4
+ pxor xmm4,xmm4
lea edi,DWORD PTR 48[edi]
movaps xmm2,xmm5
+ pxor xmm5,xmm5
sub eax,64
-$L081cbc_dec_tail_collected:
+ jmp $L088cbc_dec_tail_collected
+ALIGN 16
+$L083cbc_dec_clear_tail_collected:
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+$L088cbc_dec_tail_collected:
and eax,15
- jnz $L087cbc_dec_tail_partial
+ jnz $L090cbc_dec_tail_partial
movups XMMWORD PTR [edi],xmm2
- jmp $L077cbc_ret
+ pxor xmm0,xmm0
+ jmp $L079cbc_ret
ALIGN 16
-$L087cbc_dec_tail_partial:
+$L090cbc_dec_tail_partial:
movaps XMMWORD PTR [esp],xmm2
+ pxor xmm0,xmm0
mov ecx,16
mov esi,esp
sub ecx,eax
DD 2767451785
-$L077cbc_ret:
+ movdqa XMMWORD PTR [esp],xmm2
+$L079cbc_ret:
mov esp,DWORD PTR 16[esp]
mov ebp,DWORD PTR 36[esp]
+ pxor xmm2,xmm2
+ pxor xmm1,xmm1
movups XMMWORD PTR [ebp],xmm7
-$L072cbc_abort:
+ pxor xmm7,xmm7
+$L074cbc_abort:
pop edi
pop esi
pop ebx
@@ -1955,52 +2045,62 @@ $L072cbc_abort:
_aesni_cbc_encrypt ENDP
ALIGN 16
__aesni_set_encrypt_key PROC PRIVATE
+ push ebp
+ push ebx
test eax,eax
- jz $L088bad_pointer
+ jz $L091bad_pointer
test edx,edx
- jz $L088bad_pointer
+ jz $L091bad_pointer
+ call $L092pic
+$L092pic:
+ pop ebx
+ lea ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx]
+ lea ebp,DWORD PTR _OPENSSL_ia32cap_P
movups xmm0,XMMWORD PTR [eax]
xorps xmm4,xmm4
+ mov ebp,DWORD PTR 4[ebp]
lea edx,DWORD PTR 16[edx]
+ and ebp,268437504
cmp ecx,256
- je $L08914rounds
+ je $L09314rounds
cmp ecx,192
- je $L09012rounds
+ je $L09412rounds
cmp ecx,128
- jne $L091bad_keybits
+ jne $L095bad_keybits
ALIGN 16
-$L09210rounds:
+$L09610rounds:
+ cmp ebp,268435456
+ je $L09710rounds_alt
mov ecx,9
movups XMMWORD PTR [edx-16],xmm0
DB 102,15,58,223,200,1
- call $L093key_128_cold
+ call $L098key_128_cold
DB 102,15,58,223,200,2
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,4
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,8
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,16
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,32
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,64
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,128
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,27
- call $L094key_128
+ call $L099key_128
DB 102,15,58,223,200,54
- call $L094key_128
+ call $L099key_128
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 80[edx],ecx
- xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L094key_128:
+$L099key_128:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
-$L093key_128_cold:
+$L098key_128_cold:
shufps xmm4,xmm0,16
xorps xmm0,xmm4
shufps xmm4,xmm0,140
@@ -2009,38 +2109,91 @@ $L093key_128_cold:
xorps xmm0,xmm1
ret
ALIGN 16
-$L09012rounds:
+$L09710rounds_alt:
+ movdqa xmm5,XMMWORD PTR [ebx]
+ mov ecx,8
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD PTR [edx-16],xmm0
+$L101loop_key128:
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ lea edx,DWORD PTR 16[edx]
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx-16],xmm0
+ movdqa xmm2,xmm0
+ dec ecx
+ jnz $L101loop_key128
+ movdqa xmm4,XMMWORD PTR 48[ebx]
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ pslld xmm4,1
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx],xmm0
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+DB 102,15,56,221,196
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR 16[edx],xmm0
+ mov ecx,9
+ mov DWORD PTR 96[edx],ecx
+ jmp $L100good_key
+ALIGN 16
+$L09412rounds:
movq xmm2,QWORD PTR 16[eax]
+ cmp ebp,268435456
+ je $L10212rounds_alt
mov ecx,11
movups XMMWORD PTR [edx-16],xmm0
DB 102,15,58,223,202,1
- call $L095key_192a_cold
+ call $L103key_192a_cold
DB 102,15,58,223,202,2
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,4
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,8
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,16
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,32
- call $L096key_192b
+ call $L104key_192b
DB 102,15,58,223,202,64
- call $L097key_192a
+ call $L105key_192a
DB 102,15,58,223,202,128
- call $L096key_192b
+ call $L104key_192b
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 48[edx],ecx
- xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L097key_192a:
+$L105key_192a:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
ALIGN 16
-$L095key_192a_cold:
+$L103key_192a_cold:
movaps xmm5,xmm2
-$L098key_192b_warm:
+$L106key_192b_warm:
shufps xmm4,xmm0,16
movdqa xmm3,xmm2
xorps xmm0,xmm4
@@ -2054,56 +2207,90 @@ $L098key_192b_warm:
pxor xmm2,xmm3
ret
ALIGN 16
-$L096key_192b:
+$L104key_192b:
movaps xmm3,xmm0
shufps xmm5,xmm0,68
movups XMMWORD PTR [edx],xmm5
shufps xmm3,xmm2,78
movups XMMWORD PTR 16[edx],xmm3
lea edx,DWORD PTR 32[edx]
- jmp $L098key_192b_warm
+ jmp $L106key_192b_warm
+ALIGN 16
+$L10212rounds_alt:
+ movdqa xmm5,XMMWORD PTR 16[ebx]
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ mov ecx,8
+ movdqu XMMWORD PTR [edx-16],xmm0
+$L107loop_key192:
+ movq QWORD PTR [edx],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ pslld xmm4,1
+ lea edx,DWORD PTR 24[edx]
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pshufd xmm3,xmm0,255
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD PTR [edx-16],xmm0
+ dec ecx
+ jnz $L107loop_key192
+ mov ecx,11
+ mov DWORD PTR 32[edx],ecx
+ jmp $L100good_key
ALIGN 16
-$L08914rounds:
+$L09314rounds:
movups xmm2,XMMWORD PTR 16[eax]
- mov ecx,13
lea edx,DWORD PTR 16[edx]
+ cmp ebp,268435456
+ je $L10814rounds_alt
+ mov ecx,13
movups XMMWORD PTR [edx-32],xmm0
movups XMMWORD PTR [edx-16],xmm2
DB 102,15,58,223,202,1
- call $L099key_256a_cold
+ call $L109key_256a_cold
DB 102,15,58,223,200,1
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,2
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,2
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,4
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,4
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,8
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,8
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,16
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,16
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,32
- call $L101key_256a
+ call $L111key_256a
DB 102,15,58,223,200,32
- call $L100key_256b
+ call $L110key_256b
DB 102,15,58,223,202,64
- call $L101key_256a
+ call $L111key_256a
movups XMMWORD PTR [edx],xmm0
mov DWORD PTR 16[edx],ecx
xor eax,eax
- ret
+ jmp $L100good_key
ALIGN 16
-$L101key_256a:
+$L111key_256a:
movups XMMWORD PTR [edx],xmm2
lea edx,DWORD PTR 16[edx]
-$L099key_256a_cold:
+$L109key_256a_cold:
shufps xmm4,xmm0,16
xorps xmm0,xmm4
shufps xmm4,xmm0,140
@@ -2112,7 +2299,7 @@ $L099key_256a_cold:
xorps xmm0,xmm1
ret
ALIGN 16
-$L100key_256b:
+$L110key_256b:
movups XMMWORD PTR [edx],xmm0
lea edx,DWORD PTR 16[edx]
shufps xmm4,xmm2,16
@@ -2122,13 +2309,70 @@ $L100key_256b:
shufps xmm1,xmm1,170
xorps xmm2,xmm1
ret
+ALIGN 16
+$L10814rounds_alt:
+ movdqa xmm5,XMMWORD PTR [ebx]
+ movdqa xmm4,XMMWORD PTR 32[ebx]
+ mov ecx,7
+ movdqu XMMWORD PTR [edx-32],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD PTR [edx-16],xmm2
+$L112loop_key256:
+DB 102,15,56,0,213
+DB 102,15,56,221,212
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+ pxor xmm0,xmm2
+ movdqu XMMWORD PTR [edx],xmm0
+ dec ecx
+ jz $L113done_key256
+ pshufd xmm2,xmm0,255
+ pxor xmm3,xmm3
+DB 102,15,56,221,211
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+ pxor xmm2,xmm1
+ movdqu XMMWORD PTR 16[edx],xmm2
+ lea edx,DWORD PTR 32[edx]
+ movdqa xmm1,xmm2
+ jmp $L112loop_key256
+$L113done_key256:
+ mov ecx,13
+ mov DWORD PTR 16[edx],ecx
+$L100good_key:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ xor eax,eax
+ pop ebx
+ pop ebp
+ ret
ALIGN 4
-$L088bad_pointer:
+$L091bad_pointer:
mov eax,-1
+ pop ebx
+ pop ebp
ret
ALIGN 4
-$L091bad_keybits:
+$L095bad_keybits:
+ pxor xmm0,xmm0
mov eax,-2
+ pop ebx
+ pop ebp
ret
__aesni_set_encrypt_key ENDP
ALIGN 16
@@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin::
mov edx,DWORD PTR 12[esp]
shl ecx,4
test eax,eax
- jnz $L102dec_key_ret
+ jnz $L114dec_key_ret
lea eax,DWORD PTR 16[ecx*1+edx]
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR [eax]
@@ -2158,7 +2402,7 @@ $L_aesni_set_decrypt_key_begin::
movups XMMWORD PTR [edx],xmm1
lea edx,DWORD PTR 16[edx]
lea eax,DWORD PTR [eax-16]
-$L103dec_key_inverse:
+$L115dec_key_inverse:
movups xmm0,XMMWORD PTR [edx]
movups xmm1,XMMWORD PTR [eax]
DB 102,15,56,219,192
@@ -2168,17 +2412,28 @@ DB 102,15,56,219,201
movups XMMWORD PTR 16[eax],xmm0
movups XMMWORD PTR [edx-16],xmm1
cmp eax,edx
- ja $L103dec_key_inverse
+ ja $L115dec_key_inverse
movups xmm0,XMMWORD PTR [edx]
DB 102,15,56,219,192
movups XMMWORD PTR [edx],xmm0
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
xor eax,eax
-$L102dec_key_ret:
+$L114dec_key_ret:
ret
_aesni_set_decrypt_key ENDP
+ALIGN 64
+$Lkey_const::
+DD 202313229,202313229,202313229,202313229
+DD 67569157,67569157,67569157,67569157
+DD 1,1,1,1
+DD 27,27,27,27
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
DB 115,108,46,111,114,103,62,0
.text$ ENDS
+.bss SEGMENT 'BSS'
+COMM _OPENSSL_ia32cap_P:DWORD:4
+.bss ENDS
END