Diffstat (limited to 'lib/accelerated/x86/coff/ghash-x86_64.s')
-rw-r--r--  lib/accelerated/x86/coff/ghash-x86_64.s | 934
1 file changed, 727 insertions(+), 207 deletions(-)
diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s
index 221a226d6b..f4bcee28f0 100644
--- a/lib/accelerated/x86/coff/ghash-x86_64.s
+++ b/lib/accelerated/x86/coff/ghash-x86_64.s
@@ -39,6 +39,7 @@
#
.text
+
.globl gcm_gmult_4bit
.def gcm_gmult_4bit; .scl 2; .type 32; .endef
.p2align 4
@@ -65,14 +66,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.p2align 4
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -88,13 +89,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -103,19 +104,19 @@ gcm_gmult_4bit:
.p2align 4
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -720,8 +721,8 @@ gcm_init_clmul:
.L_init_clmul:
.LSEH_begin_gcm_init_clmul:
-.byte 0x48,0x83,0xec,0x18
-.byte 0x0f,0x29,0x34,0x24
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
movdqu (%rdx),%xmm2
pshufd $78,%xmm2,%xmm2
@@ -935,184 +936,188 @@ gcm_ghash_clmul:
leaq -136(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
-.byte 0x48,0x8d,0x60,0xe0
-.byte 0x0f,0x29,0x70,0xe0
-.byte 0x0f,0x29,0x78,0xf0
-.byte 0x44,0x0f,0x29,0x00
-.byte 0x44,0x0f,0x29,0x48,0x10
-.byte 0x44,0x0f,0x29,0x50,0x20
-.byte 0x44,0x0f,0x29,0x58,0x30
-.byte 0x44,0x0f,0x29,0x60,0x40
-.byte 0x44,0x0f,0x29,0x68,0x50
-.byte 0x44,0x0f,0x29,0x70,0x60
-.byte 0x44,0x0f,0x29,0x78,0x70
- movdqa .Lbswap_mask(%rip),%xmm5
- movq $11547335547999543296,%rax
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ movdqa .Lbswap_mask(%rip),%xmm10
movdqu (%rcx),%xmm0
movdqu (%rdx),%xmm2
- movdqu 32(%rdx),%xmm10
-.byte 102,15,56,0,197
+ movdqu 32(%rdx),%xmm7
+.byte 102,65,15,56,0,194
- subq $16,%r9
+ subq $0x10,%r9
jz .Lodd_tail
- movdqu 16(%rdx),%xmm9
- cmpq $48,%r9
+ movdqu 16(%rdx),%xmm6
+ movl _gnutls_x86_cpuid_s+4(%rip),%eax
+ cmpq $0x30,%r9
jb .Lskip4x
- subq $48,%r9
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%r9
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rdx),%xmm14
movdqu 64(%rdx),%xmm15
- movdqu 48(%r8),%xmm6
+ movdqu 48(%r8),%xmm3
movdqu 32(%r8),%xmm11
-.byte 102,15,56,0,245
-.byte 102,68,15,56,0,221
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
- pxor %xmm6,%xmm7
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,250,0
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,217,0
-.byte 102,69,15,58,68,233,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
- xorps %xmm12,%xmm7
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+ xorps %xmm12,%xmm4
movdqu 16(%r8),%xmm11
- movdqu 0(%r8),%xmm3
-.byte 102,68,15,56,0,221
-.byte 102,15,56,0,221
+ movdqu 0(%r8),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jc .Ltail4x
jmp .Lmod4_loop
.p2align 5
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
+ xorps %xmm12,%xmm4
movdqu 48(%r8),%xmm11
-.byte 102,68,15,56,0,221
+.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
- movdqu 32(%r8),%xmm6
+ xorps %xmm3,%xmm0
+ movdqu 32(%r8),%xmm3
movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+ xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
-.byte 102,15,56,0,245
- movups 32(%rdx),%xmm10
+.byte 102,65,15,56,0,218
+ movups 32(%rdx),%xmm7
+ xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
- xorps %xmm7,%xmm3
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm7
+ pshufd $78,%xmm3,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm7
- pxor %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- pslldq $8,%xmm3
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- movdqa .L7_mask(%rip),%xmm3
- pxor %xmm4,%xmm1
-.byte 102,72,15,110,224
-
- pand %xmm0,%xmm3
-.byte 102,15,56,0,227
-.byte 102,69,15,58,68,226,0
- pxor %xmm0,%xmm4
- psllq $57,%xmm4
- movdqa %xmm4,%xmm3
- pslldq $8,%xmm4
-.byte 102,65,15,58,68,241,0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqu 0(%r8),%xmm3
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%r8),%xmm8
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
-.byte 102,69,15,58,68,193,17
- xorps %xmm11,%xmm6
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
movdqu 16(%r8),%xmm11
-.byte 102,68,15,56,0,221
-.byte 102,65,15,58,68,250,16
- xorps %xmm13,%xmm8
- movups 80(%rdx),%xmm10
-.byte 102,15,56,0,221
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rdx),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm7
+ pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
psrlq $1,%xmm0
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm6
pxor %xmm1,%xmm0
-
-.byte 102,69,15,58,68,226,0
- xorps %xmm13,%xmm8
-
movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
leaq 64(%r8),%r8
- subq $64,%r9
+ subq $0x40,%r9
jnc .Lmod4_loop
.Ltail4x:
.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm7
.byte 102,65,15,58,68,207,17
- xorps %xmm6,%xmm0
-.byte 102,65,15,58,68,218,16
- xorps %xmm8,%xmm1
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
- pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm8
- pxor %xmm1,%xmm3
+ pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
@@ -1136,10 +1141,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%r9
+ addq $0x40,%r9
jz .Ldone
- movdqu 32(%rdx),%xmm10
- subq $16,%r9
+ movdqu 32(%rdx),%xmm7
+ subq $0x10,%r9
jz .Lodd_tail
.Lskip4x:
@@ -1147,102 +1152,106 @@ gcm_ghash_clmul:
- movdqu (%r8),%xmm3
- movdqu 16(%r8),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+ movdqu 16(%r8),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
- movdqa %xmm6,%xmm8
- pshufd $78,%xmm6,%xmm3
- pxor %xmm6,%xmm3
-.byte 102,15,58,68,242,0
-.byte 102,68,15,58,68,194,17
-.byte 102,65,15,58,68,218,0
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
leaq 32(%r8),%r8
- subq $32,%r9
+ nop
+ subq $0x20,%r9
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.p2align 5
.Lmod_loop:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- movdqu (%r8),%xmm8
-.byte 102,68,15,56,0,197
- movdqu 16(%r8),%xmm6
-
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm8,%xmm1
- pxor %xmm3,%xmm4
-.byte 102,15,56,0,245
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%r8),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%r8),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
- movdqa %xmm6,%xmm8
+ movdqa %xmm3,%xmm5
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
psllq $5,%xmm0
-.byte 102,15,58,68,242,0
- pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
psllq $1,%xmm0
- pxor %xmm3,%xmm0
+ pxor %xmm8,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm3
+ movdqa %xmm0,%xmm8
pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm8,%xmm3
- pxor %xmm8,%xmm3
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
-.byte 102,68,15,58,68,194,17
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm9
psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
psrlq $5,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%r8),%r8
psrlq $1,%xmm0
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- leaq 32(%r8),%r8
- subq $32,%r9
+ subq $0x20,%r9
ja .Lmod_loop
.Leven_tail:
movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
-.byte 102,65,15,58,68,193,0
-.byte 102,65,15,58,68,201,17
-.byte 102,65,15,58,68,226,16
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
- pxor %xmm6,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm3
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
pslldq $8,%xmm4
- pxor %xmm3,%xmm1
+ pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
@@ -1271,15 +1280,15 @@ gcm_ghash_clmul:
jnz .Ldone
.Lodd_tail:
- movdqu (%r8),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
+ movdqu (%r8),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,65,15,58,68,218,0
+.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
@@ -1312,7 +1321,7 @@ gcm_ghash_clmul:
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.Ldone:
-.byte 102,15,56,0,197
+.byte 102,65,15,56,0,194
movdqu %xmm0,(%rcx)
movaps (%rsp),%xmm6
movaps 16(%rsp),%xmm7
@@ -1332,7 +1341,115 @@ gcm_ghash_clmul:
.def gcm_init_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_init_avx:
- jmp .L_init_clmul
+.LSEH_begin_gcm_init_avx:
+
+.byte 0x48,0x83,0xec,0x18
+.byte 0x0f,0x29,0x34,0x24
+ vzeroupper
+
+ vmovdqu (%rdx),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.p2align 5
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rcx)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rcx)
+ leaq 48(%rcx),%rcx
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rcx)
+
+ vzeroupper
+ movaps (%rsp),%xmm6
+ leaq 24(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+ .byte 0xf3,0xc3
.globl gcm_gmult_avx
.def gcm_gmult_avx; .scl 2; .type 32; .endef
@@ -1344,7 +1461,403 @@ gcm_gmult_avx:
.def gcm_ghash_avx; .scl 2; .type 32; .endef
.p2align 5
gcm_ghash_avx:
- jmp .L_ghash_clmul
+ leaq -136(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+
+.byte 0x48,0x8d,0x60,0xe0
+.byte 0x0f,0x29,0x70,0xe0
+.byte 0x0f,0x29,0x78,0xf0
+.byte 0x44,0x0f,0x29,0x00
+.byte 0x44,0x0f,0x29,0x48,0x10
+.byte 0x44,0x0f,0x29,0x50,0x20
+.byte 0x44,0x0f,0x29,0x58,0x30
+.byte 0x44,0x0f,0x29,0x60,0x40
+.byte 0x44,0x0f,0x29,0x68,0x50
+.byte 0x44,0x0f,0x29,0x70,0x60
+.byte 0x44,0x0f,0x29,0x78,0x70
+ vzeroupper
+
+ vmovdqu (%rcx),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rdx),%rdx
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%r9
+ jb .Lshort_avx
+ subq $0x80,%r9
+
+ vmovdqu 112(%r8),%xmm14
+ vmovdqu 0-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rdx),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%r8),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rdx),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%r8),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%r8),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%r8),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%r8),%r8
+ cmpq $0x80,%r9
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%r9
+ jmp .Loop8x_avx
+
+.p2align 5
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%r8),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%r8),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%r8),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%r8),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rdx),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%r8),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rdx),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rdx),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%r8),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rdx),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%r8),%r8
+ subq $0x80,%r9
+ jnc .Loop8x_avx
+
+ addq $0x80,%r9
+ jmp .Ltail_no_xor_avx
+
+.p2align 5
+.Lshort_avx:
+ vmovdqu -16(%r8,%r9,1),%xmm14
+ leaq (%r8,%r9,1),%r8
+ vmovdqu 0-64(%rdx),%xmm6
+ vmovdqu 32-64(%rdx),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%r9
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%r8),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rdx),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rdx),%xmm7
+ subq $0x10,%r9
+ jmp .Ltail_avx
+
+.p2align 5
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%r9
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rcx)
+ vzeroupper
+ movaps (%rsp),%xmm6
+ movaps 16(%rsp),%xmm7
+ movaps 32(%rsp),%xmm8
+ movaps 48(%rsp),%xmm9
+ movaps 64(%rsp),%xmm10
+ movaps 80(%rsp),%xmm11
+ movaps 96(%rsp),%xmm12
+ movaps 112(%rsp),%xmm13
+ movaps 128(%rsp),%xmm14
+ movaps 144(%rsp),%xmm15
+ leaq 168(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+ .byte 0xf3,0xc3
.p2align 6
.Lbswap_mask:
@@ -1451,7 +1964,7 @@ se_handler:
movq 40(%r9),%rdi
movq %r8,%rsi
movl $154,%ecx
-.long 0xa548f3fc
+.long 0xa548f3fc
movq %r9,%rsi
xorq %rcx,%rcx
@@ -1498,31 +2011,38 @@ se_handler:
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+.rva .LSEH_begin_gcm_init_avx
+.rva .LSEH_end_gcm_init_avx
+.rva .LSEH_info_gcm_init_clmul
+
+.rva .LSEH_begin_gcm_ghash_avx
+.rva .LSEH_end_gcm_ghash_avx
+.rva .LSEH_info_gcm_ghash_clmul
.section .xdata
.p2align 3
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lgmult_prologue,.Lgmult_epilogue
+.rva .Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
-.rva .Lghash_prologue,.Lghash_epilogue
+.rva .Lghash_prologue,.Lghash_epilogue
.LSEH_info_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x22,0x00,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x22,0x00,0x00
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x33,0x16,0x00
-.byte 0x33,0xf8,0x09,0x00
-.byte 0x2e,0xe8,0x08,0x00
-.byte 0x29,0xd8,0x07,0x00
-.byte 0x24,0xc8,0x06,0x00
-.byte 0x1f,0xb8,0x05,0x00
-.byte 0x1a,0xa8,0x04,0x00
-.byte 0x15,0x98,0x03,0x00
-.byte 0x10,0x88,0x02,0x00
-.byte 0x0c,0x78,0x01,0x00
-.byte 0x08,0x68,0x00,0x00
-.byte 0x04,0x01,0x15,0x00
+.byte 0x33,0xf8,0x09,0x00
+.byte 0x2e,0xe8,0x08,0x00
+.byte 0x29,0xd8,0x07,0x00
+.byte 0x24,0xc8,0x06,0x00
+.byte 0x1f,0xb8,0x05,0x00
+.byte 0x1a,0xa8,0x04,0x00
+.byte 0x15,0x98,0x03,0x00
+.byte 0x10,0x88,0x02,0x00
+.byte 0x0c,0x78,0x01,0x00
+.byte 0x08,0x68,0x00,0x00
+.byte 0x04,0x01,0x15,0x00