author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:19:45 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:23:31 +0100
commit     70932fa58b8ac7233853efc0958bd52684b6ee1b (patch)
tree       d928d014f6992ffa6a0c446dcdbfcc2f1233ea89 /lib
parent     e91b2a14c43c3bccbc3a5ec1a0d3913df642dce7 (diff)
download   gnutls-70932fa58b8ac7233853efc0958bd52684b6ee1b.tar.gz
updated auto-generated asm files. This fixes a valgrind complaint when AES-NI is in use.
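For context on the commit message above: the valgrind complaint surfaces only when the accelerated AES-NI/CLMUL code paths are exercised. Below is a minimal, hedged sketch of how one might drive those paths through the public GnuTLS cipher API and then run the binary under valgrind to confirm the report is gone. It is illustrative only and not part of this commit; it assumes a GnuTLS build with the x86 acceleration compiled in, a CPU exposing AES-NI/PCLMULQDQ, and uses arbitrary placeholder key/IV/plaintext values.

```c
/* Sketch: drive AES-128-GCM through the GnuTLS cipher API so the
 * accelerated AES-NI and GHASH (CLMUL) routines are exercised, then run
 * the binary under valgrind, e.g. `valgrind ./gcm-smoke`.
 * Key/IV/plaintext are arbitrary test values, not taken from this commit. */
#include <stdio.h>
#include <gnutls/gnutls.h>
#include <gnutls/crypto.h>

int main(void)
{
	unsigned char key_data[16] = { 0 };   /* all-zero test key */
	unsigned char iv_data[12]  = { 0 };   /* 96-bit GCM nonce */
	unsigned char ptext[64]    = { 0 };   /* a few blocks of plaintext */
	unsigned char ctext[64], tag[16];
	gnutls_datum_t key = { key_data, sizeof(key_data) };
	gnutls_datum_t iv  = { iv_data, sizeof(iv_data) };
	gnutls_cipher_hd_t h;
	int ret;

	gnutls_global_init();

	ret = gnutls_cipher_init(&h, GNUTLS_CIPHER_AES_128_GCM, &key, &iv);
	if (ret < 0) {
		fprintf(stderr, "init: %s\n", gnutls_strerror(ret));
		return 1;
	}

	/* Encrypt several blocks so both the AES-NI encryption and the
	 * CLMUL GHASH assembly are hit on capable CPUs. */
	ret = gnutls_cipher_encrypt2(h, ptext, sizeof(ptext),
				     ctext, sizeof(ctext));
	if (ret < 0) {
		fprintf(stderr, "encrypt: %s\n", gnutls_strerror(ret));
		return 1;
	}

	gnutls_cipher_tag(h, tag, sizeof(tag));
	gnutls_cipher_deinit(h);
	gnutls_global_deinit();

	printf("tag[0]=%02x\n", tag[0]);
	return 0;
}
```

Build with something like `cc gcm-smoke.c -o gcm-smoke $(pkg-config --cflags --libs gnutls)`; whether the accelerated path is actually taken depends on the CPU and on the build configuration.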
Diffstat (limited to 'lib')
-rw-r--r--  lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s        574
-rw-r--r--  lib/accelerated/x86/coff/appro-aes-x86-64-coff.s           1826
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-64-coff.s              495
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-coff.s                 352
-rw-r--r--  lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s              515
-rw-r--r--  lib/accelerated/x86/elf/appro-aes-x86-64.s                 1609
-rw-r--r--  lib/accelerated/x86/elf/padlock-x86-64.s                    462
-rw-r--r--  lib/accelerated/x86/elf/padlock-x86.s                       575
-rw-r--r--  lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s    515
-rw-r--r--  lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s       1609
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-64-macosx.s          462
-rw-r--r--  lib/accelerated/x86/macosx/padlock-x86-macosx.s             349
12 files changed, 6978 insertions, 2365 deletions
diff --git a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
index fa449d6953..ceb9108c32 100644
--- a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
@@ -717,6 +717,11 @@ gcm_ghash_4bit:
.def gcm_init_clmul; .scl 2; .type 32; .endef
.p2align 4
gcm_init_clmul:
+.L_init_clmul:
+.LSEH_begin_gcm_init_clmul:
+
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
movdqu (%rdx),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -735,15 +740,15 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -753,44 +758,137 @@ gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rcx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rcx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rcx)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rcx)
- movdqu %xmm0,16(%rcx)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rcx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rcx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rcx)
+ movaps (%rsp),%xmm6
+ leaq 24(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
.byte 0xf3,0xc3
.globl gcm_gmult_clmul
.def gcm_gmult_clmul; .scl 2; .type 32; .endef
.p2align 4
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu (%rcx),%xmm0
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rdx),%xmm2
+ movdqu 32(%rdx),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -803,194 +901,372 @@ gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rcx)
.byte 0xf3,0xc3
.globl gcm_ghash_clmul
.def gcm_ghash_clmul; .scl 2; .type 32; .endef
-.p2align 4
+.p2align 5
gcm_ghash_clmul:
+.L_ghash_clmul:
+ leaq -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
-.byte 0x48,0x83,0xec,0x58
-.byte 0x0f,0x29,0x34,0x24
-.byte 0x0f,0x29,0x7c,0x24,0x10
-.byte 0x44,0x0f,0x29,0x44,0x24,0x20
-.byte 0x44,0x0f,0x29,0x4c,0x24,0x30
-.byte 0x44,0x0f,0x29,0x54,0x24,0x40
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
movdqa .Lbswap_mask(%rip),%xmm5
+ movq $11547335547999543296,%rax
movdqu (%rcx),%xmm0
movdqu (%rdx),%xmm2
+ movdqu 32(%rdx),%xmm10
.byte 102,15,56,0,197
subq $16,%r9
jz .Lodd_tail
- movdqu 16(%rdx),%xmm8
+ movdqu 16(%rdx),%xmm9
+ cmpq $48,%r9
+ jb .Lskip4x
+ subq $48,%r9
+ movdqu 48(%rdx),%xmm14
+ movdqu 64(%rdx),%xmm15
- movdqu (%r8),%xmm3
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
+ movdqu 48(%r8),%xmm6
+ movdqu 32(%r8),%xmm11
.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
+.byte 102,68,15,56,0,221
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+ pxor %xmm6,%xmm7
.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,250,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,217,0
+.byte 102,69,15,58,68,233,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,16
+ xorps %xmm13,%xmm8
+ movups 80(%rdx),%xmm10
+ xorps %xmm12,%xmm7
+
+ movdqu 16(%r8),%xmm11
+ movdqu 0(%r8),%xmm3
+.byte 102,68,15,56,0,221
+.byte 102,15,56,0,221
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
+ leaq 64(%r8),%r8
+ subq $64,%r9
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.p2align 5
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+ movdqu 48(%r8),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+ movdqu 32(%r8),%xmm6
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,15,56,0,245
+ movups 32(%rdx),%xmm10
+.byte 102,68,15,58,68,218,0
+ xorps %xmm7,%xmm3
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm7
+ pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+ pslldq $8,%xmm3
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ movdqa .L7_mask(%rip),%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,72,15,110,224
+
+ pand %xmm0,%xmm3
+.byte 102,15,56,0,227
+.byte 102,69,15,58,68,226,0
+ pxor %xmm0,%xmm4
+ psllq $57,%xmm4
+ movdqa %xmm4,%xmm3
pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
+.byte 102,65,15,58,68,241,0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqu 0(%r8),%xmm3
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,193,17
+ xorps %xmm11,%xmm6
+ movdqu 16(%r8),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,250,16
+ xorps %xmm13,%xmm8
+ movups 80(%rdx),%xmm10
+.byte 102,15,56,0,221
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm7
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+ pxor %xmm1,%xmm0
+
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
- leaq 32(%r8),%r8
- subq $32,%r9
- jbe .Leven_tail
+ leaq 64(%r8),%r8
+ subq $64,%r9
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm7,%xmm3
-.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm1
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- movdqu (%r8),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%r9
+ jz .Ldone
+ movdqu 32(%rdx),%xmm10
+ subq $16,%r9
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
+
+ movdqu (%r8),%xmm3
movdqu 16(%r8),%xmm6
.byte 102,15,56,0,221
.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm3
+ pxor %xmm6,%xmm3
+.byte 102,15,58,68,242,0
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,218,0
+
+ leaq 32(%r8),%r8
+ subq $32,%r9
+ jbe .Leven_tail
+ jmp .Lmod_loop
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
+.p2align 5
+.Lmod_loop:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu (%r8),%xmm8
+.byte 102,68,15,56,0,197
+ movdqu 16(%r8),%xmm6
+
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm8,%xmm1
+ pxor %xmm3,%xmm4
+.byte 102,15,56,0,245
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm8
+
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
- pxor %xmm3,%xmm0
.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm8,%xmm3
+ pxor %xmm8,%xmm3
-.byte 102,15,58,68,250,17
+.byte 102,68,15,58,68,194,17
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,65,15,58,68,218,0
+ pxor %xmm1,%xmm0
leaq 32(%r8),%r8
subq $32,%r9
ja .Lmod_loop
.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %r9,%r9
jnz .Ldone
@@ -1000,12 +1276,10 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,65,15,58,68,218,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1015,27 +1289,28 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.Ldone:
.byte 102,15,56,0,197
movdqu %xmm0,(%rcx)
@@ -1044,15 +1319,42 @@ gcm_ghash_clmul:
movaps 32(%rsp),%xmm8
movaps 48(%rsp),%xmm9
movaps 64(%rsp),%xmm10
- addq $88,%rsp
- .byte 0xf3,0xc3
+ movaps 80(%rsp),%xmm11
+ movaps 96(%rsp),%xmm12
+ movaps 112(%rsp),%xmm13
+ movaps 128(%rsp),%xmm14
+ movaps 144(%rsp),%xmm15
+ leaq 168(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
+ .byte 0xf3,0xc3
+
+.globl gcm_init_avx
+.def gcm_init_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_init_avx:
+ jmp .L_init_clmul
+
+.globl gcm_gmult_avx
+.def gcm_gmult_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+
+.globl gcm_ghash_avx
+.def gcm_ghash_avx; .scl 2; .type 32; .endef
+.p2align 5
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
.p2align 6
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.p2align 6
.Lrem_4bit:
@@ -1189,10 +1491,13 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+.rva .LSEH_begin_gcm_init_clmul
+.rva .LSEH_end_gcm_init_clmul
+.rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
-
.section .xdata
.p2align 3
.LSEH_info_gcm_gmult_4bit:
@@ -1203,11 +1508,20 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue
+.LSEH_info_gcm_init_clmul:
+.byte 0x01,0x08,0x03,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
-.byte 0x01,0x1f,0x0b,0x00
-.byte 0x1f,0xa8,0x04,0x00
-.byte 0x19,0x98,0x03,0x00
-.byte 0x13,0x88,0x02,0x00
-.byte 0x0d,0x78,0x01,0x00
+.byte 0x01,0x33,0x16,0x00
+.byte 0x33,0xf8,0x09,0x00
+.byte 0x2e,0xe8,0x08,0x00
+.byte 0x29,0xd8,0x07,0x00
+.byte 0x24,0xc8,0x06,0x00
+.byte 0x1f,0xb8,0x05,0x00
+.byte 0x1a,0xa8,0x04,0x00
+.byte 0x15,0x98,0x03,0x00
+.byte 0x10,0x88,0x02,0x00
+.byte 0x0c,0x78,0x01,0x00
.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0xa2,0x00,0x00
+.byte 0x04,0x01,0x15,0x00
diff --git a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
index 7bd96654d8..224a226b0d 100644
--- a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
@@ -997,211 +997,423 @@ aesni_ctr32_encrypt_blocks:
movq %r9,%rcx
movq 40(%rsp),%r8
- leaq -200(%rsp),%rsp
- movaps %xmm6,32(%rsp)
- movaps %xmm7,48(%rsp)
- movaps %xmm8,64(%rsp)
- movaps %xmm9,80(%rsp)
- movaps %xmm10,96(%rsp)
- movaps %xmm11,112(%rsp)
- movaps %xmm12,128(%rsp)
- movaps %xmm13,144(%rsp)
- movaps %xmm14,160(%rsp)
- movaps %xmm15,176(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $288,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lctr32_body:
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je .Lctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa .Lbswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm2,112(%rsp)
movl 240(%rcx),%eax
+
+ leaq 1(%r8),%r9
+ leaq 2(%r8),%r10
+ bswapl %r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,0(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,16(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
- jb .Lctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
- subq $6,%rdx
- jmp .Lctr32_loop6
+ xorl %r11d,%r9d
+ xorl %r11d,%r10d
+.byte 102,65,15,58,34,217,3
+ leaq 3(%r8),%r9
+ movdqa %xmm3,16(%rsp)
+.byte 102,65,15,58,34,226,3
+ bswapl %r9d
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%r9d
+ bswapl %r10d
+.byte 102,65,15,58,34,233,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ xorl %r11d,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ xorl %r11d,%r9d
+ movl %r9d,112+12(%rsp)
-.p2align 4
-.Lctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ movups 16(%rcx),%xmm1
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+ cmpq $8,%rdx
+ jb .Lctr32_tail
+ leaq 128(%rcx),%rcx
+ subq $8,%rdx
+ jmp .Lctr32_loop8
- pxor %xmm0,%xmm3
+.p2align 5
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa .Lincrement32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa 0(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp .Lctr32_enc_loop6_enter
-.p2align 4
-.Lctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ bswapl %r9d
.byte 102,15,56,220,225
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ bswapl %r9d
.byte 102,15,56,220,224
+ xorl %r11d,%r9d
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz .Lctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+ movdqu 0(%rdi),%xmm10
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ cmpl $11,%eax
+ jb .Lctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd 16(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,0(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,16(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc .Lctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112(%rdi),%xmm10
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+ movdqa 16(%rsp),%xmm12
+.byte 102,65,15,56,221,237
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+ movdqa 48(%rsp),%xmm14
+.byte 102,65,15,56,221,255
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+.byte 102,69,15,56,221,202
+ movups 16-128(%rcx),%xmm1
- addq $6,%rdx
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
jz .Lctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
.Lctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb .Lctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je .Lctr32_two
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb .Lctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+ shrl $1,%eax
+.byte 102,15,56,220,225
+ decl %eax
+.byte 102,15,56,220,233
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,220,249
+ movups 32(%rdi),%xmm12
+.byte 102,68,15,56,220,193
+ movups 16(%rcx),%xmm1
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je .Lctr32_four
+ call .Lenc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.p2align 5
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm10
+.byte 102,15,56,221,217
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,221,233
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.p2align 5
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp .Lctr32_done
.p2align 4
.Lctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-.Lctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1213,56 +1425,25 @@ aesni_ctr32_encrypt_blocks:
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
-.Lctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp .Lctr32_done
-
-.p2align 4
-.Lctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp .Lctr32_done
.p2align 4
-.Lctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
.Lctr32_done:
- movaps 32(%rsp),%xmm6
- movaps 48(%rsp),%xmm7
- movaps 64(%rsp),%xmm8
- movaps 80(%rsp),%xmm9
- movaps 96(%rsp),%xmm10
- movaps 112(%rsp),%xmm11
- movaps 128(%rsp),%xmm12
- movaps 144(%rsp),%xmm13
- movaps 160(%rsp),%xmm14
- movaps 176(%rsp),%xmm15
- leaq 200(%rsp),%rsp
-.Lctr32_ret:
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
+.Lctr32_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
@@ -1282,18 +1463,22 @@ aesni_xts_encrypt:
movq 40(%rsp),%r8
movq 48(%rsp),%r9
- leaq -264(%rsp),%rsp
- movaps %xmm6,96(%rsp)
- movaps %xmm7,112(%rsp)
- movaps %xmm8,128(%rsp)
- movaps %xmm9,144(%rsp)
- movaps %xmm10,160(%rsp)
- movaps %xmm11,176(%rsp)
- movaps %xmm12,192(%rsp)
- movaps %xmm13,208(%rsp)
- movaps %xmm14,224(%rsp)
- movaps %xmm15,240(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $272,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lxts_enc_body:
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1308,228 +1493,266 @@ aesni_xts_encrypt:
leaq 16(%r8),%r8
jnz .Loop_enc1_8
.byte 102,68,15,56,221,249
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.p2align 4
+.p2align 5
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.p2align 5
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
+ decl %eax
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,220,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,220,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,221,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_enc_short:
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1632,15 +1855,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1681,17 +1904,18 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
- movaps 96(%rsp),%xmm6
- movaps 112(%rsp),%xmm7
- movaps 128(%rsp),%xmm8
- movaps 144(%rsp),%xmm9
- movaps 160(%rsp),%xmm10
- movaps 176(%rsp),%xmm11
- movaps 192(%rsp),%xmm12
- movaps 208(%rsp),%xmm13
- movaps 224(%rsp),%xmm14
- movaps 240(%rsp),%xmm15
- leaq 264(%rsp),%rsp
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_enc_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -1712,18 +1936,22 @@ aesni_xts_decrypt:
movq 40(%rsp),%r8
movq 48(%rsp),%r9
- leaq -264(%rsp),%rsp
- movaps %xmm6,96(%rsp)
- movaps %xmm7,112(%rsp)
- movaps %xmm8,128(%rsp)
- movaps %xmm9,144(%rsp)
- movaps %xmm10,160(%rsp)
- movaps %xmm11,176(%rsp)
- movaps %xmm12,192(%rsp)
- movaps %xmm13,208(%rsp)
- movaps %xmm14,224(%rsp)
- movaps %xmm15,240(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $272,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,-168(%rax)
+ movaps %xmm7,-152(%rax)
+ movaps %xmm8,-136(%rax)
+ movaps %xmm9,-120(%rax)
+ movaps %xmm10,-104(%rax)
+ movaps %xmm11,-88(%rax)
+ movaps %xmm12,-72(%rax)
+ movaps %xmm13,-56(%rax)
+ movaps %xmm14,-40(%rax)
+ movaps %xmm15,-24(%rax)
.Lxts_dec_body:
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1744,228 +1972,266 @@ aesni_xts_decrypt:
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.p2align 4
+.p2align 5
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.p2align 5
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
+ decl %eax
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,222,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,222,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,223,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_dec_short:
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -2058,7 +2324,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -2068,14 +2334,8 @@ aesni_xts_decrypt:
.p2align 4
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -2086,16 +2346,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -2155,17 +2415,18 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
- movaps 96(%rsp),%xmm6
- movaps 112(%rsp),%xmm7
- movaps 128(%rsp),%xmm8
- movaps 144(%rsp),%xmm9
- movaps 160(%rsp),%xmm10
- movaps 176(%rsp),%xmm11
- movaps 192(%rsp),%xmm12
- movaps 208(%rsp),%xmm13
- movaps 224(%rsp),%xmm14
- movaps 240(%rsp),%xmm15
- leaq 264(%rsp),%rsp
+ movaps -160(%rbp),%xmm6
+ movaps -144(%rbp),%xmm7
+ movaps -128(%rbp),%xmm8
+ movaps -112(%rbp),%xmm9
+ movaps -96(%rbp),%xmm10
+ movaps -80(%rbp),%xmm11
+ movaps -64(%rbp),%xmm12
+ movaps -48(%rbp),%xmm13
+ movaps -32(%rbp),%xmm14
+ movaps -16(%rbp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_dec_epilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -2245,155 +2506,335 @@ aesni_cbc_encrypt:
.p2align 4
.Lcbc_decrypt:
- leaq -88(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,16(%rsp)
- movaps %xmm8,32(%rsp)
- movaps %xmm9,48(%rsp)
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $176,%rsp
+ andq $-16,%rsp
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
.Lcbc_decrypt_body:
- movups (%r8),%xmm9
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe .Lcbc_dec_tail
- shrl $1,%r10d
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,64(%rsp)
+ leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.p2align 4
.Lcbc_dec_loop8:
- movaps %xmm0,64(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+ setnc %r11b
.byte 102,68,15,56,222,193
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call .Ldec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ cmpl $11,%eax
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm10
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm12
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm14
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,223,228
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+ movdqu 16(%r11),%xmm12
+.byte 102,65,15,56,223,246
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+ movdqu 48(%r11),%xmm14
+.byte 102,68,15,56,223,193
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps 64(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.p2align 4
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe .Lcbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe .Lcbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,64(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps 64(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp .Lcbc_dec_tail_collected
+
.p2align 4
.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2405,116 +2846,79 @@ aesni_cbc_encrypt:
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_two:
+ movaps %xmm3,%xmm12
xorps %xmm4,%xmm4
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp .Lcbc_dec_tail_collected
.p2align 4
.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp .Lcbc_dec_tail_collected
-.p2align 4
-.Lcbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp .Lcbc_dec_tail_collected
-.p2align 4
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp .Lcbc_dec_tail_collected
+
.p2align 4
.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp .Lcbc_dec_ret
.p2align 4
.Lcbc_dec_tail_partial:
- movaps %xmm2,64(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq 64(%rsp),%rsi
+ leaq (%rsp),%rsi
.long 0x9066A4F3
.Lcbc_dec_ret:
- movaps (%rsp),%xmm6
- movaps 16(%rsp),%xmm7
- movaps 32(%rsp),%xmm8
- movaps 48(%rsp),%xmm9
- leaq 88(%rsp),%rsp
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq (%rbp),%rsp
+ popq %rbp
.Lcbc_ret:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
@@ -2759,6 +3163,8 @@ __aesni_set_encrypt_key:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
@@ -2823,45 +3229,9 @@ ccm64_se_handler:
jmp .Lcommon_seh_tail
-.def ctr32_se_handler; .scl 3; .type 32; .endef
-.p2align 4
-ctr32_se_handler:
- pushq %rsi
- pushq %rdi
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushfq
- subq $64,%rsp
-
- movq 120(%r8),%rax
- movq 248(%r8),%rbx
-
- leaq .Lctr32_body(%rip),%r10
- cmpq %r10,%rbx
- jb .Lcommon_seh_tail
-
- movq 152(%r8),%rax
-
- leaq .Lctr32_ret(%rip),%r10
- cmpq %r10,%rbx
- jae .Lcommon_seh_tail
-
- leaq 32(%rax),%rsi
- leaq 512(%r8),%rdi
- movl $20,%ecx
-.long 0xa548f3fc
- leaq 200(%rax),%rax
-
- jmp .Lcommon_seh_tail
-
-
-.def xts_se_handler; .scl 3; .type 32; .endef
+.def ctr_xts_se_handler; .scl 3; .type 32; .endef
.p2align 4
-xts_se_handler:
+ctr_xts_se_handler:
pushq %rsi
pushq %rdi
pushq %rbx
@@ -2891,13 +3261,13 @@ xts_se_handler:
cmpq %r10,%rbx
jae .Lcommon_seh_tail
- leaq 96(%rax),%rsi
+ movq 160(%r8),%rax
+ leaq -160(%rax),%rsi
leaq 512(%r8),%rdi
movl $20,%ecx
.long 0xa548f3fc
- leaq 104+160(%rax),%rax
- jmp .Lcommon_seh_tail
+ jmp .Lcommon_rbp_tail
.def cbc_se_handler; .scl 3; .type 32; .endef
.p2align 4
@@ -2928,11 +3298,16 @@ cbc_se_handler:
cmpq %r10,%rbx
jae .Lcommon_seh_tail
- leaq 0(%rax),%rsi
+ leaq 16(%rax),%rsi
leaq 512(%r8),%rdi
- movl $8,%ecx
+ movl $20,%ecx
.long 0xa548f3fc
- leaq 88(%rax),%rax
+
+.Lcommon_rbp_tail:
+ movq 160(%r8),%rax
+ movq (%rax),%rbp
+ leaq 8(%rax),%rax
+ movq %rbp,160(%r8)
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
@@ -3029,14 +3404,15 @@ cbc_se_handler:
.rva .Lccm64_dec_body,.Lccm64_dec_ret
.LSEH_info_ctr32:
.byte 9,0,0,0
-.rva ctr32_se_handler
+.rva ctr_xts_se_handler
+.rva .Lctr32_body,.Lctr32_epilogue
.LSEH_info_xts_enc:
.byte 9,0,0,0
-.rva xts_se_handler
+.rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue
.LSEH_info_xts_dec:
.byte 9,0,0,0
-.rva xts_se_handler
+.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue
.LSEH_info_cbc:
.byte 9,0,0,0
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
index 9f658ee761..a3a0e301e7 100644
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
@@ -686,6 +686,501 @@ padlock_cbc_encrypt:
movq 16(%rsp),%rsi
.byte 0xf3,0xc3
.LSEH_end_padlock_cbc_encrypt:
+.globl padlock_cfb_encrypt
+.def padlock_cfb_encrypt; .scl 2; .type 32; .endef
+.p2align 4
+padlock_cfb_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_padlock_cfb_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lcfb_abort
+ testq $15,%rcx
+ jnz .Lcfb_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lcfb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lcfb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp .Lcfb_loop
+.p2align 4
+.Lcfb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lcfb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lcfb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lcfb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lcfb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz .Lcfb_loop
+ cmpq %rbp,%rsp
+ je .Lcfb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lcfb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcfb_bzero
+
+.Lcfb_done:
+ leaq (%rbp),%rsp
+ jmp .Lcfb_exit
+
+.p2align 4
+.Lcfb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+.Lcfb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lcfb_abort:
+ popq %rbx
+ popq %rbp
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_padlock_cfb_encrypt:
+.globl padlock_ofb_encrypt
+.def padlock_ofb_encrypt; .scl 2; .type 32; .endef
+.p2align 4
+padlock_ofb_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_padlock_ofb_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lofb_abort
+ testq $15,%rcx
+ jnz .Lofb_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lofb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lofb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp .Lofb_loop
+.p2align 4
+.Lofb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lofb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lofb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lofb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lofb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz .Lofb_loop
+ cmpq %rbp,%rsp
+ je .Lofb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lofb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lofb_bzero
+
+.Lofb_done:
+ leaq (%rbp),%rsp
+ jmp .Lofb_exit
+
+.p2align 4
+.Lofb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+.Lofb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lofb_abort:
+ popq %rbx
+ popq %rbp
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_padlock_ofb_encrypt:
+.globl padlock_ctr32_encrypt
+.def padlock_ctr32_encrypt; .scl 2; .type 32; .endef
+.p2align 4
+padlock_ctr32_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax
+.LSEH_begin_padlock_ctr32_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lctr32_abort
+ testq $15,%rcx
+ jnz .Lctr32_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lctr32_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lctr32_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+.Lctr32_reenter:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $31,%eax
+ movq $512,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja .Lctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lctr32_unaligned_tail
+ jmp .Lctr32_loop
+.p2align 4
+.Lctr32_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lctr32_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lctr32_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ movl -4(%rdx),%eax
+ testl $4294901760,%eax
+ jnz .Lctr32_no_carry
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+.Lctr32_no_carry:
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lctr32_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lctr32_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jz .Lctr32_break
+ cmpq %rbx,%rcx
+ jae .Lctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jnz .Lctr32_loop
+.Lctr32_unaligned_tail:
+ xorl %eax,%eax
+ cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.p2align 4
+.Lctr32_break:
+ cmpq %rbp,%rsp
+ je .Lctr32_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lctr32_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr32_bzero
+
+.Lctr32_done:
+ leaq (%rbp),%rsp
+ jmp .Lctr32_exit
+
+.p2align 4
+.Lctr32_aligned:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $65535,%eax
+ movq $1048576,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ jbe .Lctr32_aligned_skip
+
+.Lctr32_aligned_loop:
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+
+ movl -4(%rdx),%eax
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+
+ movq %r10,%rcx
+ subq %r11,%rcx
+ movq $1048576,%rbx
+ jz .Lctr32_exit
+ cmpq %rbx,%rcx
+ jae .Lctr32_aligned_loop
+
+.Lctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz .Lctr32_exit
+
+.Lctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.Lctr32_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lctr32_abort:
+ popq %rbx
+ popq %rbp
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+.LSEH_end_padlock_ctr32_encrypt:
.byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
.data
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
index 69eb468638..d969f307b5 100644
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
@@ -515,6 +515,354 @@ _padlock_cbc_encrypt:
popl %ebx
popl %ebp
ret
+.globl _padlock_cfb_encrypt
+.def _padlock_cfb_encrypt; .scl 2; .type 32; .endef
+.align 16
+_padlock_cfb_encrypt:
+.L_padlock_cfb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L028cfb_abort
+ testl $15,%ecx
+ jnz .L028cfb_abort
+ leal .Lpadlock_saved_context,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+.L029cfb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz .L030cfb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz .L030cfb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L031cfb_loop
+.align 16
+.L031cfb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz .L032cfb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+.L032cfb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz .L033cfb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+.L033cfb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L031cfb_loop
+ cmpl %ebp,%esp
+ je .L034cfb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L035cfb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L035cfb_bzero
+.L034cfb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp .L036cfb_exit
+.align 16
+.L030cfb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+.L036cfb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+.L028cfb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ofb_encrypt
+.def _padlock_ofb_encrypt; .scl 2; .type 32; .endef
+.align 16
+_padlock_ofb_encrypt:
+.L_padlock_ofb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L037ofb_abort
+ testl $15,%ecx
+ jnz .L037ofb_abort
+ leal .Lpadlock_saved_context,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+.L038ofb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz .L039ofb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz .L039ofb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L040ofb_loop
+.align 16
+.L040ofb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz .L041ofb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+.L041ofb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz .L042ofb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+.L042ofb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L040ofb_loop
+ cmpl %ebp,%esp
+ je .L043ofb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L044ofb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L044ofb_bzero
+.L043ofb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp .L045ofb_exit
+.align 16
+.L039ofb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+.L045ofb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+.L037ofb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ctr32_encrypt
+.def _padlock_ctr32_encrypt; .scl 2; .type 32; .endef
+.align 16
+_padlock_ctr32_encrypt:
+.L_padlock_ctr32_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L046ctr32_abort
+ testl $15,%ecx
+ jnz .L046ctr32_abort
+ leal .Lpadlock_saved_context,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+.L047ctr32_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ movq -16(%edx),%mm0
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L048ctr32_loop
+.align 16
+.L048ctr32_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ movl -4(%edx),%ecx
+ xorl %edi,%edi
+ movl -8(%edx),%eax
+.L049ctr32_prepare:
+ movl %ecx,12(%esp,%edi,1)
+ bswap %ecx
+ movq %mm0,(%esp,%edi,1)
+ incl %ecx
+ movl %eax,8(%esp,%edi,1)
+ bswap %ecx
+ leal 16(%edi),%edi
+ cmpl %ebx,%edi
+ jb .L049ctr32_prepare
+ movl %ecx,-4(%edx)
+ leal (%esp),%esi
+ leal (%esp),%edi
+ movl %ebx,%ecx
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,200
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%esi
+ xorl %ecx,%ecx
+.L050ctr32_xor:
+ movups (%esi,%ecx,1),%xmm1
+ leal 16(%ecx),%ecx
+ pxor -16(%esp,%ecx,1),%xmm1
+ movups %xmm1,-16(%edi,%ecx,1)
+ cmpl %ebx,%ecx
+ jb .L050ctr32_xor
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L048ctr32_loop
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L051ctr32_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L051ctr32_bzero
+.L052ctr32_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ movl $1,%eax
+ leal 4(%esp),%esp
+ emms
+.L046ctr32_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.globl _padlock_xstore
.def _padlock_xstore; .scl 2; .type 32; .endef
.align 16
@@ -533,10 +881,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L028ret
+ jne .L053ret
addl $4,184(%ecx)
movl $0,%eax
-.L028ret:
+.L053ret:
ret
.globl _padlock_sha1_oneshot
.def _padlock_sha1_oneshot; .scl 2; .type 32; .endef
diff --git a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
index 8f2b96ff1f..9755951f7b 100644
--- a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
+++ b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
@@ -697,6 +697,7 @@ gcm_ghash_4bit:
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
+.L_init_clmul:
movdqu (%rsi),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -715,15 +716,15 @@ gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -733,44 +734,134 @@ gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -783,186 +874,358 @@ gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,@function
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
movdqa .Lbswap_mask(%rip),%xmm5
+ movq $11547335547999543296,%rax
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm10
.byte 102,15,56,0,197
subq $16,%rcx
jz .Lodd_tail
- movdqu 16(%rsi),%xmm8
+ movdqu 16(%rsi),%xmm9
+ cmpq $48,%rcx
+ jb .Lskip4x
+ subq $48,%rcx
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
+ movdqu 48(%rdx),%xmm6
+ movdqu 32(%rdx),%xmm11
.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
+.byte 102,68,15,56,0,221
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+ pxor %xmm6,%xmm7
.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,250,0
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,217,0
+.byte 102,69,15,58,68,233,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,16
+ xorps %xmm13,%xmm8
+ movups 80(%rsi),%xmm10
+ xorps %xmm12,%xmm7
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm3
+.byte 102,68,15,56,0,221
+.byte 102,15,56,0,221
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+ movdqu 48(%rdx),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+ movdqu 32(%rdx),%xmm6
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,15,56,0,245
+ movups 32(%rsi),%xmm10
+.byte 102,68,15,58,68,218,0
+ xorps %xmm7,%xmm3
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm7
+ pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+ pslldq $8,%xmm3
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ movdqa .L7_mask(%rip),%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,72,15,110,224
+
+ pand %xmm0,%xmm3
+.byte 102,15,56,0,227
+.byte 102,69,15,58,68,226,0
+ pxor %xmm0,%xmm4
+ psllq $57,%xmm4
+ movdqa %xmm4,%xmm3
pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
+.byte 102,65,15,58,68,241,0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqu 0(%rdx),%xmm3
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,193,17
+ xorps %xmm11,%xmm6
+ movdqu 16(%rdx),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,250,16
+ xorps %xmm13,%xmm8
+ movups 80(%rsi),%xmm10
+.byte 102,15,56,0,221
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm7
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+ pxor %xmm1,%xmm0
+
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe .Leven_tail
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm7,%xmm3
-.Lmod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm1
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz .Ldone
+ movdqu 32(%rsi),%xmm10
+ subq $16,%rcx
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm3
movdqu 16(%rdx),%xmm6
.byte 102,15,56,0,221
.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm3
+ pxor %xmm6,%xmm3
+.byte 102,15,58,68,242,0
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,218,0
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ jbe .Leven_tail
+ jmp .Lmod_loop
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
+.align 32
+.Lmod_loop:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu (%rdx),%xmm8
+.byte 102,68,15,56,0,197
+ movdqu 16(%rdx),%xmm6
+
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm8,%xmm1
+ pxor %xmm3,%xmm4
+.byte 102,15,56,0,245
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm8
+
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
- pxor %xmm3,%xmm0
.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm8,%xmm3
+ pxor %xmm8,%xmm3
-.byte 102,15,58,68,250,17
+.byte 102,68,15,58,68,194,17
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,65,15,58,68,218,0
+ pxor %xmm1,%xmm0
leaq 32(%rdx),%rdx
subq $32,%rcx
ja .Lmod_loop
.Leven_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %rcx,%rcx
jnz .Ldone
@@ -972,12 +1235,10 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,65,15,58,68,218,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -987,38 +1248,60 @@ gcm_ghash_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.Ldone:
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.align 64
.type .Lrem_4bit,@object
.Lrem_4bit:
diff --git a/lib/accelerated/x86/elf/appro-aes-x86-64.s b/lib/accelerated/x86/elf/appro-aes-x86-64.s
index f48666f7ae..d3734a6edd 100644
--- a/lib/accelerated/x86/elf/appro-aes-x86-64.s
+++ b/lib/accelerated/x86/elf/appro-aes-x86-64.s
@@ -925,199 +925,412 @@ aesni_ccm64_decrypt_blocks:
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $128,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je .Lctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa .Lbswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm2,112(%rsp)
movl 240(%rcx),%eax
+
+ leaq 1(%r8),%r9
+ leaq 2(%r8),%r10
+ bswapl %r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,-40(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,-24(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
- jb .Lctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
- subq $6,%rdx
- jmp .Lctr32_loop6
+ xorl %r11d,%r9d
+ xorl %r11d,%r10d
+.byte 102,65,15,58,34,217,3
+ leaq 3(%r8),%r9
+ movdqa %xmm3,16(%rsp)
+.byte 102,65,15,58,34,226,3
+ bswapl %r9d
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%r9d
+ bswapl %r10d
+.byte 102,65,15,58,34,233,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ xorl %r11d,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ xorl %r11d,%r9d
+ movl %r9d,112+12(%rsp)
-.align 16
-.Lctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ movups 16(%rcx),%xmm1
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+ cmpq $8,%rdx
+ jb .Lctr32_tail
+ leaq 128(%rcx),%rcx
+ subq $8,%rdx
+ jmp .Lctr32_loop8
- pxor %xmm0,%xmm3
+.align 32
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa .Lincrement32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa -40(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ bswapl %r9d
.byte 102,15,56,220,225
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ bswapl %r9d
.byte 102,15,56,220,224
+ xorl %r11d,%r9d
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz .Lctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+ movdqu 0(%rdi),%xmm10
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ cmpl $11,%eax
+ jb .Lctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd -24(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,-40(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,-24(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc .Lctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
- addq $6,%rdx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112(%rdi),%xmm10
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+ movdqa 16(%rsp),%xmm12
+.byte 102,65,15,56,221,237
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+ movdqa 48(%rsp),%xmm14
+.byte 102,65,15,56,221,255
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+.byte 102,69,15,56,221,202
+ movups 16-128(%rcx),%xmm1
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
jz .Lctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
.Lctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb .Lctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je .Lctr32_two
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb .Lctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+ shrl $1,%eax
+.byte 102,15,56,220,225
+ decl %eax
+.byte 102,15,56,220,233
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,220,249
+ movups 32(%rdi),%xmm12
+.byte 102,68,15,56,220,193
+ movups 16(%rcx),%xmm1
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je .Lctr32_four
+ call .Lenc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm10
+.byte 102,15,56,221,217
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,221,233
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-.Lctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1129,51 +1342,26 @@ aesni_ctr32_encrypt_blocks:
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp .Lctr32_done
.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
.Lctr32_done:
+ leaq (%rbp),%rsp
+ popq %rbp
+.Lctr32_epilogue:
.byte 0xf3,0xc3
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,@function
.align 16
aesni_xts_encrypt:
- leaq -104(%rsp),%rsp
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1188,228 +1376,266 @@ aesni_xts_encrypt:
leaq 16(%r8),%r8
jnz .Loop_enc1_8
.byte 102,68,15,56,221,249
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_enc_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_enc_loop6_enter
-
-.align 16
+.byte 102,15,56,220,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-.Lxts_enc_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
+ decl %eax
jnz .Lxts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,220,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,220,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,221,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_enc_short:
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz .Lxts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb .Lxts_enc_one
+ pxor %xmm0,%xmm12
je .Lxts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb .Lxts_enc_three
+ pxor %xmm0,%xmm14
je .Lxts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1512,15 +1738,15 @@ aesni_xts_encrypt:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
@@ -1561,7 +1787,8 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1569,7 +1796,11 @@ aesni_xts_encrypt:
.type aesni_xts_decrypt,@function
.align 16
aesni_xts_decrypt:
- leaq -104(%rsp),%rsp
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1590,228 +1821,266 @@ aesni_xts_decrypt:
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa .Lxts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.align 16
+.byte 102,15,56,222,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
+ decl %eax
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,222,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,222,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,223,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc .Lxts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_dec_short:
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz .Lxts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb .Lxts_dec_one
+ pxor %xmm0,%xmm13
je .Lxts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1904,7 +2173,7 @@ aesni_xts_decrypt:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -1914,14 +2183,8 @@ aesni_xts_decrypt:
.align 16
.Lxts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -1932,16 +2195,16 @@ aesni_xts_decrypt:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
@@ -2001,7 +2264,8 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2068,149 +2332,324 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_decrypt:
- movups (%r8),%xmm9
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $16,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe .Lcbc_dec_tail
- shrl $1,%r10d
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,-24(%rsp)
+ leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
- movaps %xmm0,-24(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+ setnc %r11b
.byte 102,68,15,56,222,193
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call .Ldec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ cmpl $11,%eax
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm10
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm12
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm14
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,223,228
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+ movdqu 16(%r11),%xmm12
+.byte 102,65,15,56,223,246
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+ movdqu 48(%r11),%xmm14
+.byte 102,68,15,56,223,193
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp .Lcbc_dec_tail_collected
+
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe .Lcbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe .Lcbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,-24(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2222,111 +2661,69 @@ aesni_cbc_encrypt:
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
+ movaps %xmm3,%xmm12
xorps %xmm4,%xmm4
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps %xmm2,-24(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq -24(%rsp),%rsi
+ leaq (%rsp),%rsi
.long 0x9066A4F3
.Lcbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
.Lcbc_ret:
.byte 0xf3,0xc3
.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
@@ -2569,6 +2966,8 @@ __aesni_set_encrypt_key:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s
index 4709ac2273..2ac113d72c 100644
--- a/lib/accelerated/x86/elf/padlock-x86-64.s
+++ b/lib/accelerated/x86/elf/padlock-x86-64.s
@@ -595,6 +595,468 @@ padlock_cbc_encrypt:
popq %rbp
.byte 0xf3,0xc3
.size padlock_cbc_encrypt,.-padlock_cbc_encrypt
+.globl padlock_cfb_encrypt
+.type padlock_cfb_encrypt,@function
+.align 16
+padlock_cfb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lcfb_abort
+ testq $15,%rcx
+ jnz .Lcfb_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lcfb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lcfb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp .Lcfb_loop
+.align 16
+.Lcfb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lcfb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lcfb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lcfb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lcfb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz .Lcfb_loop
+ cmpq %rbp,%rsp
+ je .Lcfb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lcfb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcfb_bzero
+
+.Lcfb_done:
+ leaq (%rbp),%rsp
+ jmp .Lcfb_exit
+
+.align 16
+.Lcfb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+.Lcfb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lcfb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+.size padlock_cfb_encrypt,.-padlock_cfb_encrypt
+.globl padlock_ofb_encrypt
+.type padlock_ofb_encrypt,@function
+.align 16
+padlock_ofb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lofb_abort
+ testq $15,%rcx
+ jnz .Lofb_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lofb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lofb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp .Lofb_loop
+.align 16
+.Lofb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lofb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lofb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lofb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lofb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz .Lofb_loop
+ cmpq %rbp,%rsp
+ je .Lofb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lofb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lofb_bzero
+
+.Lofb_done:
+ leaq (%rbp),%rsp
+ jmp .Lofb_exit
+
+.align 16
+.Lofb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+.Lofb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lofb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+.size padlock_ofb_encrypt,.-padlock_ofb_encrypt
+.globl padlock_ctr32_encrypt
+.type padlock_ctr32_encrypt,@function
+.align 16
+padlock_ctr32_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz .Lctr32_abort
+ testq $15,%rcx
+ jnz .Lctr32_abort
+ leaq .Lpadlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz .Lctr32_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz .Lctr32_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+.Lctr32_reenter:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $31,%eax
+ movq $512,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja .Lctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lctr32_unaligned_tail
+ jmp .Lctr32_loop
+.align 16
+.Lctr32_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz .Lctr32_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+.Lctr32_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ movl -4(%rdx),%eax
+ testl $4294901760,%eax
+ jnz .Lctr32_no_carry
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+.Lctr32_no_carry:
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz .Lctr32_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+.Lctr32_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jz .Lctr32_break
+ cmpq %rbx,%rcx
+ jae .Lctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jnz .Lctr32_loop
+.Lctr32_unaligned_tail:
+ xorl %eax,%eax
+ cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.align 16
+.Lctr32_break:
+ cmpq %rbp,%rsp
+ je .Lctr32_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+.Lctr32_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr32_bzero
+
+.Lctr32_done:
+ leaq (%rbp),%rsp
+ jmp .Lctr32_exit
+
+.align 16
+.Lctr32_aligned:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $65535,%eax
+ movq $1048576,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ jbe .Lctr32_aligned_skip
+
+.Lctr32_aligned_loop:
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+
+ movl -4(%rdx),%eax
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+
+ movq %r10,%rcx
+ subq %r11,%rcx
+ movq $1048576,%rbx
+ jz .Lctr32_exit
+ cmpq %rbx,%rcx
+ jae .Lctr32_aligned_loop
+
+.Lctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz .Lctr32_exit
+
+.Lctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lctr32_loop
+.Lctr32_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+.Lctr32_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+.size padlock_ctr32_encrypt,.-padlock_ctr32_encrypt
.byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
.data
diff --git a/lib/accelerated/x86/elf/padlock-x86.s b/lib/accelerated/x86/elf/padlock-x86.s
index ea982ec4a7..2199255efe 100644
--- a/lib/accelerated/x86/elf/padlock-x86.s
+++ b/lib/accelerated/x86/elf/padlock-x86.s
@@ -187,16 +187,14 @@ padlock_ecb_encrypt:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -208,10 +206,28 @@ padlock_ecb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -236,8 +252,8 @@ padlock_ecb_encrypt:
testl $15,%edi
jz .L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L010ecb_out_aligned:
@@ -247,43 +263,75 @@ padlock_ecb_encrypt:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
cmpl %ebp,%esp
- je .L011ecb_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
+ cmpl %ebp,%esp
+ je .L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
.L004ecb_abort:
@@ -307,19 +355,17 @@ padlock_cbc_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
testl $15,%ecx
- jnz .L015cbc_abort
- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax
+ jnz .L016cbc_abort
+ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax
pushfl
cld
call _padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
testl $32,(%edx)
jnz .L018cbc_aligned
testl $15,%edi
@@ -339,7 +385,25 @@ padlock_cbc_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -351,13 +415,13 @@ padlock_cbc_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -367,67 +431,450 @@ padlock_cbc_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
cmpl %ebp,%esp
- je .L022cbc_done
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
+ cmpl %ebp,%esp
+ je .L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L024cbc_exit
+ jmp .L026cbc_exit
.align 16
-.L017cbc_short:
+.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,208
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
leal -24(%esp),%ebp
- subl %ecx,%eax
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+.L016cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
+.globl padlock_cfb_encrypt
+.type padlock_cfb_encrypt,@function
+.align 16
+padlock_cfb_encrypt:
+.L_padlock_cfb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L028cfb_abort
+ testl $15,%ecx
+ jnz .L028cfb_abort
+ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax
+ pushfl
+ cld
+ call _padlock_verify_ctx
+.L029cfb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz .L030cfb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz .L030cfb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L031cfb_loop
+.align 16
+.L031cfb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz .L032cfb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+.L032cfb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz .L033cfb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+.L033cfb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L031cfb_loop
+ cmpl %ebp,%esp
+ je .L034cfb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L035cfb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L035cfb_bzero
+.L034cfb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp .L036cfb_exit
+.align 16
+.L030cfb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+.L036cfb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+.L028cfb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size padlock_cfb_encrypt,.-.L_padlock_cfb_encrypt_begin
+.globl padlock_ofb_encrypt
+.type padlock_ofb_encrypt,@function
+.align 16
+padlock_ofb_encrypt:
+.L_padlock_ofb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L037ofb_abort
+ testl $15,%ecx
+ jnz .L037ofb_abort
+ leal .Lpadlock_saved_context-.L038ofb_pic_point,%eax
+ pushfl
+ cld
+ call _padlock_verify_ctx
+.L038ofb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
+ testl $32,(%edx)
+ jnz .L039ofb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz .L039ofb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
movl %ecx,%ebx
- jmp .L019cbc_loop
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L040ofb_loop
.align 16
-.L018cbc_aligned:
+.L040ofb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz .L041ofb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+.L041ofb_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
-.byte 243,15,167,208
+.byte 243,15,167,232
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz .L042ofb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+.L042ofb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L040ofb_loop
+ cmpl %ebp,%esp
+ je .L043ofb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L044ofb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L044ofb_bzero
+.L043ofb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp .L045ofb_exit
+.align 16
+.L039ofb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+.L045ofb_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L015cbc_abort:
+.L037ofb_abort:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
+.size padlock_ofb_encrypt,.-.L_padlock_ofb_encrypt_begin
+.globl padlock_ctr32_encrypt
+.type padlock_ctr32_encrypt,@function
+.align 16
+padlock_ctr32_encrypt:
+.L_padlock_ctr32_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz .L046ctr32_abort
+ testl $15,%ecx
+ jnz .L046ctr32_abort
+ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax
+ pushfl
+ cld
+ call _padlock_verify_ctx
+.L047ctr32_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ movq -16(%edx),%mm0
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp .L048ctr32_loop
+.align 16
+.L048ctr32_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ movl -4(%edx),%ecx
+ xorl %edi,%edi
+ movl -8(%edx),%eax
+.L049ctr32_prepare:
+ movl %ecx,12(%esp,%edi,1)
+ bswap %ecx
+ movq %mm0,(%esp,%edi,1)
+ incl %ecx
+ movl %eax,8(%esp,%edi,1)
+ bswap %ecx
+ leal 16(%edi),%edi
+ cmpl %ebx,%edi
+ jb .L049ctr32_prepare
+ movl %ecx,-4(%edx)
+ leal (%esp),%esi
+ leal (%esp),%edi
+ movl %ebx,%ecx
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,200
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%esi
+ xorl %ecx,%ecx
+.L050ctr32_xor:
+ movups (%esi,%ecx,1),%xmm1
+ leal 16(%ecx),%ecx
+ pxor -16(%esp,%ecx,1),%xmm1
+ movups %xmm1,-16(%edi,%ecx,1)
+ cmpl %ebx,%ecx
+ jb .L050ctr32_xor
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz .L048ctr32_loop
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+.L051ctr32_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja .L051ctr32_bzero
+.L052ctr32_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ movl $1,%eax
+ leal 4(%esp),%esp
+ emms
+.L046ctr32_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size padlock_ctr32_encrypt,.-.L_padlock_ctr32_encrypt_begin
.globl padlock_xstore
.type padlock_xstore,@function
.align 16
@@ -447,10 +894,10 @@ _win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L026ret
+ jne .L053ret
addl $4,184(%ecx)
movl $0,%eax
-.L026ret:
+.L053ret:
ret
.size _win32_segv_handler,.-_win32_segv_handler
.globl padlock_sha1_oneshot
diff --git a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
index cfac705042..eac88aeba1 100644
--- a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
@@ -699,6 +699,7 @@ L$ghash_epilogue:
.p2align 4
_gcm_init_clmul:
+L$_init_clmul:
movdqu (%rsi),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -717,15 +718,15 @@ _gcm_init_clmul:
pxor %xmm5,%xmm2
+ pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -735,44 +736,134 @@ _gcm_init_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.globl _gcm_gmult_clmul
.p2align 4
_gcm_gmult_clmul:
+L$_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa L$bswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -785,186 +876,358 @@ _gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.globl _gcm_ghash_clmul
-.p2align 4
+.p2align 5
_gcm_ghash_clmul:
+L$_ghash_clmul:
movdqa L$bswap_mask(%rip),%xmm5
+ movq $11547335547999543296,%rax
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm10
.byte 102,15,56,0,197
subq $16,%rcx
jz L$odd_tail
- movdqu 16(%rsi),%xmm8
+ movdqu 16(%rsi),%xmm9
+ cmpq $48,%rcx
+ jb L$skip4x
+ subq $48,%rcx
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
- movdqu (%rdx),%xmm3
- movdqu 16(%rdx),%xmm6
-.byte 102,15,56,0,221
+ movdqu 48(%rdx),%xmm6
+ movdqu 32(%rdx),%xmm11
.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm6,%xmm3
- pxor %xmm2,%xmm4
+.byte 102,68,15,56,0,221
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+ pxor %xmm6,%xmm7
.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,220,0
- pxor %xmm6,%xmm3
- pxor %xmm7,%xmm3
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,250,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,217,0
+.byte 102,69,15,58,68,233,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,16
+ xorps %xmm13,%xmm8
+ movups 80(%rsi),%xmm10
+ xorps %xmm12,%xmm7
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm3
+.byte 102,68,15,56,0,221
+.byte 102,15,56,0,221
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc L$tail4x
+
+ jmp L$mod4_loop
+.p2align 5
+L$mod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+ movdqu 48(%rdx),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+ movdqu 32(%rdx),%xmm6
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,15,56,0,245
+ movups 32(%rsi),%xmm10
+.byte 102,68,15,58,68,218,0
+ xorps %xmm7,%xmm3
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm7
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm7
+ pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
+ pslldq $8,%xmm3
+.byte 102,68,15,58,68,234,17
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ movdqa L$7_mask(%rip),%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,72,15,110,224
+
+ pand %xmm0,%xmm3
+.byte 102,15,56,0,227
+.byte 102,69,15,58,68,226,0
+ pxor %xmm0,%xmm4
+ psllq $57,%xmm4
+ movdqa %xmm4,%xmm3
pslldq $8,%xmm4
- pxor %xmm3,%xmm7
- pxor %xmm4,%xmm6
+.byte 102,65,15,58,68,241,0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqu 0(%rdx),%xmm3
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,193,17
+ xorps %xmm11,%xmm6
+ movdqu 16(%rdx),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,250,16
+ xorps %xmm13,%xmm8
+ movups 80(%rsi),%xmm10
+.byte 102,15,56,0,221
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm7
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+ pxor %xmm1,%xmm0
+
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe L$even_tail
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc L$mod4_loop
+
+L$tail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm7,%xmm3
-L$mod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm1
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz L$done
+ movdqu 32(%rsi),%xmm10
+ subq $16,%rcx
+ jz L$odd_tail
+L$skip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm3
movdqu 16(%rdx),%xmm6
.byte 102,15,56,0,221
.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm3
+ pxor %xmm6,%xmm3
+.byte 102,15,58,68,242,0
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,218,0
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ jbe L$even_tail
+ jmp L$mod_loop
+
+.p2align 5
+L$mod_loop:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu (%rdx),%xmm8
+.byte 102,68,15,56,0,197
+ movdqu 16(%rdx),%xmm6
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm8,%xmm1
+ pxor %xmm3,%xmm4
+.byte 102,15,56,0,245
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm8
+
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
- pxor %xmm3,%xmm0
.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm8,%xmm3
+ pxor %xmm8,%xmm3
-.byte 102,15,58,68,250,17
+.byte 102,68,15,58,68,194,17
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,65,15,58,68,218,0
+ pxor %xmm1,%xmm0
leaq 32(%rdx),%rdx
subq $32,%rcx
ja L$mod_loop
L$even_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
testq %rcx,%rcx
jnz L$done
@@ -974,12 +1237,10 @@ L$odd_tail:
pxor %xmm3,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,65,15,58,68,218,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -989,38 +1250,60 @@ L$odd_tail:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
L$done:
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-L$SEH_end_gcm_ghash_clmul:
+
+.globl _gcm_init_avx
+
+.p2align 5
+_gcm_init_avx:
+ jmp L$_init_clmul
+
+.globl _gcm_gmult_avx
+
+.p2align 5
+_gcm_gmult_avx:
+ jmp L$_gmult_clmul
+
+.globl _gcm_ghash_avx
+
+.p2align 5
+_gcm_ghash_avx:
+ jmp L$_ghash_clmul
.p2align 6
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long 7,0,7,0
+L$7_mask_poly:
+.long 7,0,450,0
.p2align 6
L$rem_4bit:
diff --git a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
index a82f0a55c6..e2cfa17951 100644
--- a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
@@ -927,199 +927,412 @@ L$oop_enc1_6:
.p2align 4
_aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $128,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+
cmpq $1,%rdx
je L$ctr32_one_shortcut
- movdqu (%r8),%xmm14
- movdqa L$bswap_mask(%rip),%xmm15
- xorl %eax,%eax
-.byte 102,69,15,58,22,242,3
-.byte 102,68,15,58,34,240,3
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm2,112(%rsp)
movl 240(%rcx),%eax
+
+ leaq 1(%r8),%r9
+ leaq 2(%r8),%r10
+ bswapl %r9d
bswapl %r10d
- pxor %xmm12,%xmm12
- pxor %xmm13,%xmm13
-.byte 102,69,15,58,34,226,0
- leaq 3(%r10),%r11
-.byte 102,69,15,58,34,235,0
- incl %r10d
-.byte 102,69,15,58,34,226,1
- incq %r11
-.byte 102,69,15,58,34,235,1
- incl %r10d
-.byte 102,69,15,58,34,226,2
- incq %r11
-.byte 102,69,15,58,34,235,2
- movdqa %xmm12,-40(%rsp)
-.byte 102,69,15,56,0,231
- movdqa %xmm13,-24(%rsp)
-.byte 102,69,15,56,0,239
-
- pshufd $192,%xmm12,%xmm2
- pshufd $128,%xmm12,%xmm3
- pshufd $64,%xmm12,%xmm4
- cmpq $6,%rdx
- jb L$ctr32_tail
- shrl $1,%eax
- movq %rcx,%r11
- movl %eax,%r10d
- subq $6,%rdx
- jmp L$ctr32_loop6
+ xorl %r11d,%r9d
+ xorl %r11d,%r10d
+.byte 102,65,15,58,34,217,3
+ leaq 3(%r8),%r9
+ movdqa %xmm3,16(%rsp)
+.byte 102,65,15,58,34,226,3
+ bswapl %r9d
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%r9d
+ bswapl %r10d
+.byte 102,65,15,58,34,233,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ xorl %r11d,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ xorl %r11d,%r9d
+ movl %r9d,112+12(%rsp)
-.p2align 4
-L$ctr32_loop6:
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm2
- movups (%r11),%xmm0
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm3
- movups 16(%r11),%xmm1
- pshufd $64,%xmm13,%xmm7
- por %xmm14,%xmm4
- por %xmm14,%xmm5
- xorps %xmm0,%xmm2
- por %xmm14,%xmm6
- por %xmm14,%xmm7
+ movups 16(%rcx),%xmm1
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+ cmpq $8,%rdx
+ jb L$ctr32_tail
+ leaq 128(%rcx),%rcx
+ subq $8,%rdx
+ jmp L$ctr32_loop8
- pxor %xmm0,%xmm3
+.p2align 5
+L$ctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
.byte 102,15,56,220,217
- movdqa L$increment32(%rip),%xmm13
- pxor %xmm0,%xmm5
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- movdqa -40(%rsp),%xmm12
- pxor %xmm0,%xmm6
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- jmp L$ctr32_enc_loop6_enter
-.p2align 4
-L$ctr32_enc_loop6:
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
+ bswapl %r9d
.byte 102,15,56,220,225
+ xorl %r11d,%r9d
.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$ctr32_enc_loop6_enter:
- movups 16(%rcx),%xmm1
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
+ bswapl %r9d
.byte 102,15,56,220,224
+ xorl %r11d,%r9d
.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups (%rcx),%xmm0
- jnz L$ctr32_enc_loop6
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ bswapl %r9d
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ bswapl %r9d
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+ movdqu 0(%rdi),%xmm10
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ cmpl $11,%eax
+ jb L$ctr32_enc_done
.byte 102,15,56,220,209
- paddd %xmm13,%xmm12
.byte 102,15,56,220,217
- paddd -24(%rsp),%xmm13
.byte 102,15,56,220,225
- movdqa %xmm12,-40(%rsp)
.byte 102,15,56,220,233
- movdqa %xmm13,-24(%rsp)
.byte 102,15,56,220,241
-.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
-.byte 102,69,15,56,0,239
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
-.byte 102,15,56,221,208
- movups (%rdi),%xmm8
-.byte 102,15,56,221,216
- movups 16(%rdi),%xmm9
-.byte 102,15,56,221,224
- movups 32(%rdi),%xmm10
-.byte 102,15,56,221,232
- movups 48(%rdi),%xmm11
-.byte 102,15,56,221,240
- movups 64(%rdi),%xmm1
-.byte 102,15,56,221,248
- movups 80(%rdi),%xmm0
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je L$ctr32_enc_done
- xorps %xmm2,%xmm8
- pshufd $192,%xmm12,%xmm2
- xorps %xmm3,%xmm9
- pshufd $128,%xmm12,%xmm3
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- pshufd $64,%xmm12,%xmm4
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- xorps %xmm7,%xmm0
- movups %xmm1,64(%rsi)
- movups %xmm0,80(%rsi)
- leaq 96(%rsi),%rsi
- movl %r10d,%eax
- subq $6,%rdx
- jnc L$ctr32_loop6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
- addq $6,%rdx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+
+L$ctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112(%rdi),%xmm10
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+ movdqa 16(%rsp),%xmm12
+.byte 102,65,15,56,221,237
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+ movdqa 48(%rsp),%xmm14
+.byte 102,65,15,56,221,255
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+.byte 102,69,15,56,221,202
+ movups 16-128(%rcx),%xmm1
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc L$ctr32_loop8
+
+ addq $8,%rdx
jz L$ctr32_done
- movq %r11,%rcx
- leal 1(%rax,%rax,1),%eax
+ leaq -128(%rcx),%rcx
L$ctr32_tail:
- por %xmm14,%xmm2
- movups (%rdi),%xmm8
- cmpq $2,%rdx
- jb L$ctr32_one
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb L$ctr32_loop3
+ je L$ctr32_loop4
- por %xmm14,%xmm3
- movups 16(%rdi),%xmm9
- je L$ctr32_two
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
- pshufd $192,%xmm13,%xmm5
- por %xmm14,%xmm4
- movups 32(%rdi),%xmm10
- cmpq $4,%rdx
- jb L$ctr32_three
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+ shrl $1,%eax
+.byte 102,15,56,220,225
+ decl %eax
+.byte 102,15,56,220,233
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,220,249
+ movups 32(%rdi),%xmm12
+.byte 102,68,15,56,220,193
+ movups 16(%rcx),%xmm1
- pshufd $128,%xmm13,%xmm6
- por %xmm14,%xmm5
- movups 48(%rdi),%xmm11
- je L$ctr32_four
+ call L$enc_loop8_enter
- por %xmm14,%xmm6
- xorps %xmm7,%xmm7
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb L$ctr32_done
- call _aesni_encrypt6
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je L$ctr32_done
- movups 64(%rdi),%xmm1
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- xorps %xmm6,%xmm1
- movups %xmm11,48(%rsi)
- movups %xmm1,64(%rsi)
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz L$ctr32_loop4
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm10
+.byte 102,15,56,221,217
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,221,233
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ decl %eax
+ jnz L$ctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb L$ctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je L$ctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_one_shortcut:
movups (%r8),%xmm2
- movups (%rdi),%xmm8
+ movups (%rdi),%xmm10
movl 240(%rcx),%eax
-L$ctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -1131,51 +1344,26 @@ L$oop_enc1_7:
leaq 16(%rcx),%rcx
jnz L$oop_enc1_7
.byte 102,15,56,221,209
- xorps %xmm2,%xmm8
- movups %xmm8,(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
-L$ctr32_two:
- xorps %xmm4,%xmm4
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- movups %xmm9,16(%rsi)
- jmp L$ctr32_done
-
-.p2align 4
-L$ctr32_three:
- call _aesni_encrypt3
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- movups %xmm10,32(%rsi)
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
jmp L$ctr32_done
.p2align 4
-L$ctr32_four:
- call _aesni_encrypt4
- xorps %xmm2,%xmm8
- xorps %xmm3,%xmm9
- movups %xmm8,(%rsi)
- xorps %xmm4,%xmm10
- movups %xmm9,16(%rsi)
- xorps %xmm5,%xmm11
- movups %xmm10,32(%rsi)
- movups %xmm11,48(%rsi)
-
L$ctr32_done:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$ctr32_epilogue:
.byte 0xf3,0xc3
.globl _aesni_xts_encrypt
.p2align 4
_aesni_xts_encrypt:
- leaq -104(%rsp),%rsp
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1190,228 +1378,266 @@ L$oop_enc1_8:
leaq 16(%r8),%r8
jnz L$oop_enc1_8
.byte 102,68,15,56,221,249
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_enc_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
-.p2align 4
+.p2align 5
L$xts_enc_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,220,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,220,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,220,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,220,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_enc_loop6_enter
-
-.p2align 4
+.byte 102,15,56,220,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_enc_loop6
+.p2align 5
L$xts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
-L$xts_enc_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
+ decl %eax
jnz L$xts_enc_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,220,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,220,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,224
- pxor %xmm9,%xmm15
.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,220,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,220,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,220,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,220,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,220,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,220,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
paddq %xmm15,%xmm15
-.byte 102,15,56,221,208
- pand %xmm8,%xmm9
-.byte 102,15,56,221,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,221,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,220,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,221,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_enc_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
L$xts_enc_short:
+ pxor %xmm0,%xmm10
addq $96,%rdx
jz L$xts_enc_done
+ pxor %xmm0,%xmm11
cmpq $32,%rdx
jb L$xts_enc_one
+ pxor %xmm0,%xmm12
je L$xts_enc_two
+ pxor %xmm0,%xmm13
cmpq $64,%rdx
jb L$xts_enc_three
+ pxor %xmm0,%xmm14
je L$xts_enc_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1514,15 +1740,15 @@ L$xts_enc_four:
call _aesni_encrypt4
- xorps %xmm10,%xmm2
- movdqa %xmm15,%xmm10
- xorps %xmm11,%xmm3
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_enc_done
@@ -1563,7 +1789,8 @@ L$oop_enc1_10:
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
L$xts_enc_epilogue:
.byte 0xf3,0xc3
@@ -1571,7 +1798,11 @@ L$xts_enc_epilogue:
.p2align 4
_aesni_xts_decrypt:
- leaq -104(%rsp),%rsp
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1592,228 +1823,266 @@ L$oop_enc1_11:
shlq $4,%rax
subq %rax,%rdx
+ movups (%rcx),%xmm0
movq %rcx,%r11
movl %r10d,%eax
+ shll $4,%r10d
movq %rdx,%r9
andq $-16,%rdx
+ movups 16(%rcx,%r10,1),%xmm1
+ movl %eax,%r10d
+
movdqa L$xts_magic(%rip),%xmm8
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pshufd $95,%xmm15,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
paddq %xmm15,%xmm15
- pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
- pxor %xmm9,%xmm15
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
- pcmpgtd %xmm15,%xmm14
+ pxor %xmm0,%xmm14
pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc L$xts_dec_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
-.p2align 4
+.p2align 5
L$xts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp L$xts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_dec_loop6
+.p2align 5
L$xts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-L$xts_dec_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
+ decl %eax
jnz L$xts_dec_loop6
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- paddq %xmm15,%xmm15
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm10,%xmm11
.byte 102,15,56,222,249
movups 16(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm10
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,208
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm10
+ psrad $31,%xmm14
.byte 102,15,56,222,216
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,224
- pxor %xmm9,%xmm15
.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,240
+ movaps %xmm11,%xmm12
.byte 102,15,56,222,248
movups 32(%rcx),%xmm0
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm11
- paddq %xmm15,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
.byte 102,15,56,222,209
- pand %xmm8,%xmm9
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
.byte 102,15,56,222,217
- pcmpgtd %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
.byte 102,15,56,222,225
- pxor %xmm9,%xmm15
+ movdqa %xmm13,48(%rsp)
.byte 102,15,56,222,233
+ pxor %xmm14,%xmm15
.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
.byte 102,15,56,222,249
+ movups 48(%rcx),%xmm1
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm12
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
paddq %xmm15,%xmm15
-.byte 102,15,56,223,208
- pand %xmm8,%xmm9
-.byte 102,15,56,223,216
- pcmpgtd %xmm15,%xmm14
-.byte 102,15,56,223,224
- pxor %xmm9,%xmm15
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm14,%xmm15
+.byte 102,15,56,222,240
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
- pshufd $19,%xmm14,%xmm9
- pxor %xmm14,%xmm14
- movdqa %xmm15,%xmm13
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+ psrad $31,%xmm9
+.byte 102,15,56,223,84,36,0
paddq %xmm15,%xmm15
- xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
- xorps 16(%rsp),%xmm3
- pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
pxor %xmm9,%xmm15
-
- xorps 32(%rsp),%xmm4
- movups %xmm2,0(%rsi)
- xorps 48(%rsp),%xmm5
- movups %xmm3,16(%rsi)
- xorps 64(%rsp),%xmm6
- movups %xmm4,32(%rsi)
- xorps 80(%rsp),%xmm7
- movups %xmm5,48(%rsi)
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
movl %r10d,%eax
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
+
leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
subq $96,%rdx
jnc L$xts_dec_grandloop
- leal 3(%rax,%rax,1),%eax
+ leal 7(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
L$xts_dec_short:
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
addq $96,%rdx
jz L$xts_dec_done
+ pxor %xmm0,%xmm12
cmpq $32,%rdx
jb L$xts_dec_one
+ pxor %xmm0,%xmm13
je L$xts_dec_two
+ pxor %xmm0,%xmm14
cmpq $64,%rdx
jb L$xts_dec_three
je L$xts_dec_four
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
- pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
@@ -1906,7 +2175,7 @@ L$xts_dec_three:
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
- movdqa %xmm15,%xmm11
+ movdqa %xmm14,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@@ -1916,14 +2185,8 @@ L$xts_dec_three:
.p2align 4
L$xts_dec_four:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movups (%rdi),%xmm2
- pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
@@ -1934,16 +2197,16 @@ L$xts_dec_four:
call _aesni_decrypt4
- xorps %xmm10,%xmm2
+ pxor %xmm10,%xmm2
movdqa %xmm14,%xmm10
- xorps %xmm11,%xmm3
+ pxor %xmm11,%xmm3
movdqa %xmm15,%xmm11
- xorps %xmm12,%xmm4
- movups %xmm2,(%rsi)
- xorps %xmm13,%xmm5
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp L$xts_dec_done
@@ -2003,7 +2266,8 @@ L$oop_dec1_14:
movups %xmm2,(%rsi)
L$xts_dec_ret:
- leaq 104(%rsp),%rsp
+ leaq (%rbp),%rsp
+ popq %rbp
L$xts_dec_epilogue:
.byte 0xf3,0xc3
@@ -2070,149 +2334,324 @@ L$cbc_enc_tail:
.p2align 4
L$cbc_decrypt:
- movups (%r8),%xmm9
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $16,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $112,%rdx
+ cmpq $80,%rdx
jbe L$cbc_dec_tail
- shrl $1,%r10d
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ cmpq $112,%rdx
+ jbe L$cbc_dec_six_or_seven
+
subq $112,%rdx
- movl %r10d,%eax
- movaps %xmm9,-24(%rsp)
+ leaq 112(%rcx),%rcx
jmp L$cbc_dec_loop8_enter
.p2align 4
L$cbc_dec_loop8:
- movaps %xmm0,-24(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
L$cbc_dec_loop8_enter:
- movups (%rcx),%xmm0
- movups (%rdi),%xmm2
- movups 16(%rdi),%xmm3
- movups 16(%rcx),%xmm1
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
- leaq 32(%rcx),%rcx
- movdqu 32(%rdi),%xmm4
- xorps %xmm0,%xmm2
- movdqu 48(%rdi),%xmm5
- xorps %xmm0,%xmm3
- movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
- pxor %xmm0,%xmm4
- movdqu 80(%rdi),%xmm7
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- decl %eax
.byte 102,15,56,222,241
- pxor %xmm0,%xmm8
.byte 102,15,56,222,249
- pxor %xmm0,%xmm9
- movups (%rcx),%xmm0
+ setnc %r11b
.byte 102,68,15,56,222,193
+ shlq $7,%r11
.byte 102,68,15,56,222,201
- movups 16(%rcx),%xmm1
-
- call L$dec_loop8_enter
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ cmpl $11,%eax
+ jb L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+L$cbc_dec_done:
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm10
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm12
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm14
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,223,228
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+ movdqu 16(%r11),%xmm12
+.byte 102,65,15,56,223,246
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+ movdqu 48(%r11),%xmm14
+.byte 102,68,15,56,223,193
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
+
subq $128,%rdx
ja L$cbc_dec_loop8
movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
addq $112,%rdx
jle L$cbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+
+ movaps %xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja L$cbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp L$cbc_dec_tail_collected
+
L$cbc_dec_tail:
movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
jbe L$cbc_dec_one
movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
jbe L$cbc_dec_two
movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
jbe L$cbc_dec_three
movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
jbe L$cbc_dec_four
movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe L$cbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe L$cbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,-24(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
jmp L$cbc_dec_tail_collected
+
.p2align 4
L$cbc_dec_one:
+ movaps %xmm2,%xmm11
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
@@ -2224,111 +2663,69 @@ L$oop_dec1_16:
leaq 16(%rcx),%rcx
jnz L$oop_dec1_16
.byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_two:
+ movaps %xmm3,%xmm12
xorps %xmm4,%xmm4
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
leaq 16(%rsi),%rsi
- subq $32,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_three:
+ movaps %xmm4,%xmm13
call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
leaq 32(%rsi),%rsi
- subq $48,%rdx
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_four:
+ movaps %xmm5,%xmm14
call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
jmp L$cbc_dec_tail_collected
+
.p2align 4
L$cbc_dec_tail_collected:
+ movups %xmm10,(%r8)
andq $15,%rdx
- movups %xmm9,(%r8)
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
- movaps %xmm2,-24(%rsp)
+ movaps %xmm2,(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
- leaq -24(%rsp),%rsi
+ leaq (%rsp),%rsi
.long 0x9066A4F3
L$cbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
L$cbc_ret:
.byte 0xf3,0xc3
@@ -2571,6 +2968,8 @@ L$increment64:
.long 1,0,0,0
L$xts_magic:
.long 0x87,0,1,0
+L$increment1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
index b9ec30c03c..1327e82172 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
@@ -597,6 +597,468 @@ L$cbc_abort:
popq %rbp
.byte 0xf3,0xc3
+.globl _padlock_cfb_encrypt
+
+.p2align 4
+_padlock_cfb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$cfb_abort
+ testq $15,%rcx
+ jnz L$cfb_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$cfb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$cfb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp L$cfb_loop
+.p2align 4
+L$cfb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$cfb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$cfb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$cfb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$cfb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz L$cfb_loop
+ cmpq %rbp,%rsp
+ je L$cfb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$cfb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$cfb_bzero
+
+L$cfb_done:
+ leaq (%rbp),%rsp
+ jmp L$cfb_exit
+
+.p2align 4
+L$cfb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+L$cfb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$cfb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
+.globl _padlock_ofb_encrypt
+
+.p2align 4
+_padlock_ofb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$ofb_abort
+ testq $15,%rcx
+ jnz L$ofb_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$ofb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$ofb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp L$ofb_loop
+.p2align 4
+L$ofb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$ofb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$ofb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$ofb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$ofb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz L$ofb_loop
+ cmpq %rbp,%rsp
+ je L$ofb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$ofb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$ofb_bzero
+
+L$ofb_done:
+ leaq (%rbp),%rsp
+ jmp L$ofb_exit
+
+.p2align 4
+L$ofb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+L$ofb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$ofb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
+.globl _padlock_ctr32_encrypt
+
+.p2align 4
+_padlock_ctr32_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$ctr32_abort
+ testq $15,%rcx
+ jnz L$ctr32_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$ctr32_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$ctr32_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+L$ctr32_reenter:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $31,%eax
+ movq $512,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja L$ctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$ctr32_unaligned_tail
+ jmp L$ctr32_loop
+.p2align 4
+L$ctr32_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$ctr32_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$ctr32_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ movl -4(%rdx),%eax
+ testl $4294901760,%eax
+ jnz L$ctr32_no_carry
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+L$ctr32_no_carry:
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$ctr32_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$ctr32_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jz L$ctr32_break
+ cmpq %rbx,%rcx
+ jae L$ctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jnz L$ctr32_loop
+L$ctr32_unaligned_tail:
+ xorl %eax,%eax
+ cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
+.p2align 4
+L$ctr32_break:
+ cmpq %rbp,%rsp
+ je L$ctr32_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$ctr32_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$ctr32_bzero
+
+L$ctr32_done:
+ leaq (%rbp),%rsp
+ jmp L$ctr32_exit
+
+.p2align 4
+L$ctr32_aligned:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $65535,%eax
+ movq $1048576,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ jbe L$ctr32_aligned_skip
+
+L$ctr32_aligned_loop:
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+
+ movl -4(%rdx),%eax
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+
+ movq %r10,%rcx
+ subq %r11,%rcx
+ movq $1048576,%rbx
+ jz L$ctr32_exit
+ cmpq %rbx,%rcx
+ jae L$ctr32_aligned_loop
+
+L$ctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$ctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz L$ctr32_exit
+
+L$ctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
+L$ctr32_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$ctr32_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
.byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
.data
diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
index 7a38b7c3e1..1a2fa9246b 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
@@ -510,6 +510,351 @@ L016cbc_abort:
popl %ebx
popl %ebp
ret
+.globl _padlock_cfb_encrypt
+.align 4
+_padlock_cfb_encrypt:
+L_padlock_cfb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L028cfb_abort
+ testl $15,%ecx
+ jnz L028cfb_abort
+ leal Lpadlock_saved_context-L029cfb_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L029cfb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz L030cfb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz L030cfb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L031cfb_loop
+.align 4,0x90
+L031cfb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz L032cfb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+L032cfb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz L033cfb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+L033cfb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L031cfb_loop
+ cmpl %ebp,%esp
+ je L034cfb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L035cfb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L035cfb_bzero
+L034cfb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp L036cfb_exit
+.align 4,0x90
+L030cfb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+L036cfb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+L028cfb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ofb_encrypt
+.align 4
+_padlock_ofb_encrypt:
+L_padlock_ofb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L037ofb_abort
+ testl $15,%ecx
+ jnz L037ofb_abort
+ leal Lpadlock_saved_context-L038ofb_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L038ofb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz L039ofb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz L039ofb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L040ofb_loop
+.align 4,0x90
+L040ofb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz L041ofb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+L041ofb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz L042ofb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+L042ofb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L040ofb_loop
+ cmpl %ebp,%esp
+ je L043ofb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L044ofb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L044ofb_bzero
+L043ofb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp L045ofb_exit
+.align 4,0x90
+L039ofb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+L045ofb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+L037ofb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ctr32_encrypt
+.align 4
+_padlock_ctr32_encrypt:
+L_padlock_ctr32_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L046ctr32_abort
+ testl $15,%ecx
+ jnz L046ctr32_abort
+ leal Lpadlock_saved_context-L047ctr32_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L047ctr32_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ movq -16(%edx),%mm0
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L048ctr32_loop
+.align 4,0x90
+L048ctr32_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ movl -4(%edx),%ecx
+ xorl %edi,%edi
+ movl -8(%edx),%eax
+L049ctr32_prepare:
+ movl %ecx,12(%esp,%edi,1)
+ bswap %ecx
+ movq %mm0,(%esp,%edi,1)
+ incl %ecx
+ movl %eax,8(%esp,%edi,1)
+ bswap %ecx
+ leal 16(%edi),%edi
+ cmpl %ebx,%edi
+ jb L049ctr32_prepare
+ movl %ecx,-4(%edx)
+ leal (%esp),%esi
+ leal (%esp),%edi
+ movl %ebx,%ecx
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,200
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%esi
+ xorl %ecx,%ecx
+L050ctr32_xor:
+ movups (%esi,%ecx,1),%xmm1
+ leal 16(%ecx),%ecx
+ pxor -16(%esp,%ecx,1),%xmm1
+ movups %xmm1,-16(%edi,%ecx,1)
+ cmpl %ebx,%ecx
+ jb L050ctr32_xor
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L048ctr32_loop
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L051ctr32_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L051ctr32_bzero
+L052ctr32_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ movl $1,%eax
+ leal 4(%esp),%esp
+ emms
+L046ctr32_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.globl _padlock_xstore
.align 4
_padlock_xstore:
@@ -526,10 +871,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne L028ret
+ jne L053ret
addl $4,184(%ecx)
movl $0,%eax
-L028ret:
+L053ret:
ret
.globl _padlock_sha1_oneshot
.align 4