summaryrefslogtreecommitdiff
path: root/lib/accelerated/x86/coff
diff options
context:
space:
mode:
authorNikos Mavrogiannopoulos <nmav@gnutls.org>2013-12-14 13:00:55 +0100
committerNikos Mavrogiannopoulos <nmav@gnutls.org>2013-12-14 13:09:07 +0100
commit226ae36af51105cd21a5d2bdcc21e9f4062f14bd (patch)
tree99a9a7005aacde34f7a3cb91965883f634765e6d /lib/accelerated/x86/coff
parent48097fa622ff63f9839cc11f2c88ef7af495e9a7 (diff)
downloadgnutls-226ae36af51105cd21a5d2bdcc21e9f4062f14bd.tar.gz
Added Mike Hamburg's SSSE3 AES implementation.
Diffstat (limited to 'lib/accelerated/x86/coff')
-rw-r--r--lib/accelerated/x86/coff/aes-ssse3-x86.s662
-rw-r--r--lib/accelerated/x86/coff/aes-ssse3-x86_64.s1137
-rw-r--r--lib/accelerated/x86/coff/aesni-x86.s2
-rw-r--r--lib/accelerated/x86/coff/aesni-x86_64.s2
-rw-r--r--lib/accelerated/x86/coff/cpuid-x86.s3
-rw-r--r--lib/accelerated/x86/coff/cpuid-x86_64.s3
-rw-r--r--lib/accelerated/x86/coff/e_padlock-x86.s2
-rw-r--r--lib/accelerated/x86/coff/e_padlock-x86_64.s2
-rw-r--r--lib/accelerated/x86/coff/ghash-x86_64.s2
-rw-r--r--lib/accelerated/x86/coff/sha1-ssse3-x86.s2
-rw-r--r--lib/accelerated/x86/coff/sha1-ssse3-x86_64.s2
-rw-r--r--lib/accelerated/x86/coff/sha256-ssse3-x86.s2
-rw-r--r--lib/accelerated/x86/coff/sha512-ssse3-x86.s2
-rw-r--r--lib/accelerated/x86/coff/sha512-ssse3-x86_64.s2
14 files changed, 1813 insertions, 12 deletions
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s
new file mode 100644
index 0000000000..6894b14b7c
--- /dev/null
+++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s
@@ -0,0 +1,662 @@
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+#
+# *** This file is auto-generated ***
+#
+.file "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.def __vpaes_preheat; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_preheat (i386): finish the position-independent constant-base
+# computation and preload two constants.
+# In:  %ebp = link-time offset. Callers in this file pass
+#      .L_vpaes_consts+0x30 minus the address of the label that directly
+#      follows their call instruction.
+# Out: %ebp = %ebp + return address (with the callers above this is the
+#             run-time address of .L_vpaes_consts+0x30)
+#      %xmm7 = 16 bytes at -48(%ebp)
+#      %xmm6 = 16 bytes at -16(%ebp)
+# Modifies: %ebp, %xmm6, %xmm7, flags.
+__vpaes_preheat:
+ addl (%esp),%ebp # (%esp) is the return address pushed by the caller's call
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.def __vpaes_encrypt_core; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_encrypt_core (i386): transform the 16-byte block in %xmm0.
+# In:  %xmm0 = input block
+#      %edx  = key schedule; 16-byte entries are read sequentially from
+#              (%edx) and the dword at 240(%edx) is used as loop counter
+#      %ebp  = constants base, %xmm6/%xmm7 = constants
+#              (callers set these up through __vpaes_preheat)
+# Out: %xmm0 = output block
+# Modifies: %eax, %ebx, %ecx, %edx, %xmm1-%xmm5, flags.
+# The ".byte 102,15,56,0,modrm" sequences are hand-encoded SSSE3 pshufb
+# instructions; each one is decoded in a trailing comment.
+__vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax # loop counter
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1 # xmm1 = input bits outside the xmm6 mask
+ pand %xmm6,%xmm0 # xmm0 = input bits inside the xmm6 mask
+ movdqu (%edx),%xmm5 # first key schedule entry (unaligned load)
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa 16(%ebp),%xmm0
+ pxor %xmm5,%xmm2
+ psrld $4,%xmm1
+ addl $16,%edx # last flag-setting instruction before the first jnz below
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm5,%xmm4 # mix in the key schedule entry loaded at the end of enc_entry
+ movdqa 64(%ebp),%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234 # pshufb %xmm2,%xmm5
+ movdqa 80(%ebp),%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ addl $16,%edx # advance to the next key schedule entry
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ andl $48,%ecx # keep the table index in {0,16,32,48}
+ subl $1,%eax # sets ZF for the jnz below; the SSE code in between leaves EFLAGS alone
+ pxor %xmm3,%xmm0
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,232 # pshufb %xmm0,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ movdqu (%edx),%xmm5 # next key schedule entry
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop # ZF comes from subl above (or from addl $16,%edx on first entry)
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ ret
+.def __vpaes_decrypt_core; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_decrypt_core (i386): inverse-direction counterpart of
+# __vpaes_encrypt_core, with the same register contract.
+# In:  %xmm0 = input block
+#      %edx  = key schedule; 16-byte entries are read sequentially from
+#              (%edx) and the dword at 240(%edx) is used as loop counter
+#      %ebp  = constants base, %xmm6/%xmm7 = constants
+#              (callers set these up through __vpaes_preheat)
+# Out: %xmm0 = output block
+# Modifies: %eax, %ebx, %ecx, %edx, %xmm1-%xmm5, flags.
+# ".byte 102,15,56,0,modrm" is a hand-encoded pshufb and
+# ".byte 102,15,58,15,modrm,imm" a hand-encoded palignr.
+__vpaes_decrypt_core:
+ leal 608(%ebp),%ebx # %ebx = base of the tables used by the loop below
+ movl 240(%edx),%eax # loop counter
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5 # first key schedule entry (unaligned load)
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ andl $48,%ecx # %ecx = ((counter << 4) ^ 48) & 48
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx # last flag-setting instruction before the first jnz below
+ leal -352(%ebx,%ecx,1),%ecx # %ecx = %ebp + 256 + index; read by the final shuffle
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ addl $16,%edx # advance to the next key schedule entry
+.byte 102,15,58,15,237,12 # palignr $12,%xmm5,%xmm5 (byte-rotate xmm5)
+ pxor %xmm1,%xmm0
+ subl $1,%eax # sets ZF for the jnz below; the SSE code in between leaves EFLAGS alone
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ psrld $4,%xmm1
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ movdqu (%edx),%xmm0 # next key schedule entry
+ pxor %xmm1,%xmm3
+ jnz .L003dec_loop # ZF comes from subl above (or from addl $16,%edx on first entry)
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2 # shuffle mask selected before the loop
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194 # pshufb %xmm2,%xmm0
+ ret
+.def __vpaes_schedule_core; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_schedule_core (i386): expand a user key into a key schedule.
+# In:  %esi = user key (16 bytes read at 0, and at 8 or 16 for the
+#             longer key sizes)
+#      %eax = key length in bits (compared against 192 to pick a path)
+#      %edx = where the first schedule entry is written
+#      %edi = 0 for the encrypting direction, non-zero for decrypting
+#      %ecx = byte index (multiple of 16) into the table at 256(%ebp)
+#      %ebp = link-time offset, completed here by adding the return
+#             address (same trick as __vpaes_preheat)
+# Stack: writes 16 bytes at 4(%esp) and, on the 256-bit path, at
+#      20(%esp). Both lie above the return address, so the caller must
+#      have reserved 16-byte aligned scratch space there.
+# Out: schedule entries stored through %edx; %xmm0-%xmm7 zeroed.
+# Modifies: %eax, %ebx, %ecx, %edx, %ebp, %xmm0-%xmm7, flags, and %esi
+#      when %edi != 0 (via __vpaes_schedule_mangle).
+__vpaes_schedule_core:
+ addl (%esp),%ebp # add the return address to turn the offset into an address
+ movdqu (%esi),%xmm0 # first 16 bytes of the user key
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx # table pair for __vpaes_schedule_transform
+ movdqa %xmm2,4(%esp) # stack copy; __vpaes_schedule_round reads it at its 8(%esp)
+ call __vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx) # encrypting: store the transformed key
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqu %xmm3,(%edx) # decrypting: store the permuted raw key
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax # %eax is reused as the round counter from here on
+.L009loop_schedule_128:
+ call __vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0 # key bytes 8..23
+ call __vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6 # clear the low 64 bits of %xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call __vpaes_schedule_round
+.byte 102,15,58,15,198,8 # palignr $8,%xmm6,%xmm0
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0 # key bytes 16..31
+ call __vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call __vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call __vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0 # broadcast the top dword of %xmm0
+ movdqa %xmm7,20(%esp) # park %xmm7 on the stack across the low round
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx # output transform tables for the decrypting direction
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ leal 352(%ebp),%ebx # output transform tables for the encrypting direction
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call __vpaes_schedule_transform
+ movdqu %xmm0,(%edx) # store the last schedule entry
+ pxor %xmm0,%xmm0 # clear every xmm register that held key material
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.def __vpaes_schedule_192_smear; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_schedule_192_smear (i386): helper for the 192-bit path.
+# In:  %xmm6, %xmm7
+# Out: %xmm0 = %xmm6 ^ shuffle(%xmm6) ^ shuffle(%xmm7) (shuffles below)
+#      %xmm6 = the same value with its low 64 bits cleared
+#      %xmm1 = 0
+# %xmm7 and the integer registers are not modified.
+__vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1 # dwords, low to high: xmm6[0],xmm6[0],xmm6[0],xmm6[2]
+ pshufd $254,%xmm7,%xmm0 # dwords, low to high: xmm7[2],xmm7[3],xmm7[3],xmm7[3]
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6 # low 64 bits of xmm6 := high 64 bits of xmm1, which is zero
+ ret
+.def __vpaes_schedule_round; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_schedule_round (i386): one key schedule round, with a second
+# entry point .L_vpaes_schedule_low_round that skips the first part.
+# In:  %xmm0, %xmm7 = working schedule state, %ebp = constants base
+#      8(%esp) = 16-byte value that __vpaes_schedule_core stored at its
+#                own 4(%esp); it is byte-rotated and written back here.
+#                (Full entry only. The low-round entry does not touch
+#                the stack.)
+# Out: %xmm0 = %xmm7 = result of the round
+# Modifies: %xmm1-%xmm5, flags are not touched.
+__vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15 # palignr $15,%xmm2,%xmm1: byte 0 = top byte of xmm2, rest zero
+.byte 102,15,58,15,210,15 # palignr $15,%xmm2,%xmm2: rotate xmm2 up by one byte
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0 # broadcast the top dword of xmm0
+.byte 102,15,58,15,192,1 # palignr $1,%xmm0,%xmm0: rotate xmm0 down by one byte
+ movdqa %xmm2,8(%esp) # write the rotated value back for the next round
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7 # xmm7 ^= xmm7 shifted up 4 bytes
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7 # xmm7 ^= xmm7 shifted up 8 bytes
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4 # same constants that __vpaes_preheat loads into xmm6/xmm7
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0 # fold in the smeared previous state
+ movdqa %xmm0,%xmm7
+ ret
+.def __vpaes_schedule_transform; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_schedule_transform (i386): byte-wise table transform of %xmm0
+# using two 16-byte tables.
+# In:  %xmm0 = value to transform
+#      %ebx  = address of the table pair (16-byte aligned; read at 0 and 16)
+#      %ebp  = constants base (the mask at -16(%ebp) is used to split
+#              the input)
+# Out: %xmm0 = pshufb(table at 0, masked part) ^
+#              pshufb(table at 16, remaining part shifted right by 4)
+# Modifies: %xmm1, %xmm2.
+__vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ pxor %xmm2,%xmm0
+ ret
+.def __vpaes_schedule_mangle; .scl 3; .type 32; .endef
+.align 16
+# __vpaes_schedule_mangle (i386): convert the current schedule state
+# into its stored form and write it to the key schedule.
+# In:  %xmm0 = state (left unchanged, a copy is worked on)
+#      %edx  = address of the previously written entry
+#      %edi  = 0: %edx advances by 16; non-zero: %edx goes back by 16
+#      %ecx  = byte index into the table at 256(%ebp)
+#      %ebp  = constants base
+# Out: 16 bytes stored at the updated (%edx); %ecx = (%ecx - 16) & 48
+# Modifies: %xmm1-%xmm5, flags, and %esi on the %edi != 0 path.
+__vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi # overwrites %esi with a table address
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ addl $-16,%ecx
+ andl $48,%ecx # step the index down by one entry, wrapping within 0..48
+ movdqu %xmm3,(%edx)
+ ret
+.globl _vpaes_set_encrypt_key
+.def _vpaes_set_encrypt_key; .scl 2; .type 32; .endef
+.align 16
+# _vpaes_set_encrypt_key (i386, stack arguments).
+# Args on entry: 4(%esp) = user key, 8(%esp) = key length in bits,
+#                12(%esp) = key schedule to fill.
+# Stores (bits >> 5) + 5 at offset 240 of the schedule, then runs
+# __vpaes_schedule_core with %edi = 0 and %ecx = 48.
+# Returns 0 in %eax. %ebp, %ebx, %esi and %edi are saved and restored.
+_vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi # arg 1: user key
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax # arg 2: bits
+ andl $-16,%ebx # 16-byte aligned scratch frame for movdqa in the callee
+ movl 28(%esp),%edx # arg 3: key schedule
+ xchgl %esp,%ebx # switch to the aligned frame; %ebx = original %esp
+ movl %ebx,48(%esp) # remember the original %esp
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx) # value later read by the encrypt/decrypt cores as loop counter
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L016pic_point,%ebp # offset; callee adds its return address
+ call __vpaes_schedule_core
+.L016pic_point:
+ movl 48(%esp),%esp # back to the original stack
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_set_decrypt_key
+.def _vpaes_set_decrypt_key; .scl 2; .type 32; .endef
+.align 16
+# _vpaes_set_decrypt_key (i386, stack arguments).
+# Args on entry: 4(%esp) = user key, 8(%esp) = key length in bits,
+#                12(%esp) = key schedule to fill.
+# Stores r = (bits >> 5) + 5 at offset 240 of the schedule, then runs
+# __vpaes_schedule_core with %edi = 1, %edx = schedule + 16 + 16*r and
+# %ecx = ((bits >> 1) & 32) ^ 32, so entries are written going downwards.
+# Returns 0 in %eax. %ebp, %ebx, %esi and %edi are saved and restored.
+_vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi # arg 1: user key
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax # arg 2: bits
+ andl $-16,%ebx # 16-byte aligned scratch frame for movdqa in the callee
+ movl 28(%esp),%edx # arg 3: key schedule
+ xchgl %esp,%ebx # switch to the aligned frame; %ebx = original %esp
+ movl %ebx,48(%esp) # remember the original %esp
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx # start from the far end of the schedule
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp # offset; callee adds its return address
+ call __vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp # back to the original stack
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_encrypt
+.def _vpaes_encrypt; .scl 2; .type 32; .endef
+.align 16
+# _vpaes_encrypt (i386, stack arguments): process one 16-byte block.
+# Args on entry: 4(%esp) = input block, 8(%esp) = output block,
+#                12(%esp) = key schedule.
+# Input and output are accessed with unaligned loads/stores.
+# %ebp, %ebx, %esi and %edi are saved and restored; %eax is left with
+# whatever __vpaes_encrypt_core leaves in it.
+_vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp # offset; preheat adds its return address
+ call __vpaes_preheat
+.L018pic_point:
+ movl 20(%esp),%esi # arg 1: input
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi # arg 2: output
+ andl $-16,%ebx
+ movl 28(%esp),%edx # arg 3: key schedule
+ xchgl %esp,%ebx # switch to a 16-byte aligned frame; %ebx = original %esp
+ movl %ebx,48(%esp) # remember the original %esp
+ movdqu (%esi),%xmm0
+ call __vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp # back to the original stack
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_decrypt
+.def _vpaes_decrypt; .scl 2; .type 32; .endef
+.align 16
+# _vpaes_decrypt (i386, stack arguments): process one 16-byte block
+# with __vpaes_decrypt_core.
+# Args on entry: 4(%esp) = input block, 8(%esp) = output block,
+#                12(%esp) = key schedule.
+# Input and output are accessed with unaligned loads/stores.
+# %ebp, %ebx, %esi and %edi are saved and restored; %eax is left with
+# whatever __vpaes_decrypt_core leaves in it.
+_vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L019pic_point,%ebp # offset; preheat adds its return address
+ call __vpaes_preheat
+.L019pic_point:
+ movl 20(%esp),%esi # arg 1: input
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi # arg 2: output
+ andl $-16,%ebx
+ movl 28(%esp),%edx # arg 3: key schedule
+ xchgl %esp,%ebx # switch to a 16-byte aligned frame; %ebx = original %esp
+ movl %ebx,48(%esp) # remember the original %esp
+ movdqu (%esi),%xmm0
+ call __vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp # back to the original stack
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_cbc_encrypt
+.def _vpaes_cbc_encrypt; .scl 2; .type 32; .endef
+.align 16
+# _vpaes_cbc_encrypt (i386, stack arguments): CBC chaining around the
+# single-block cores.
+# Args on entry: 4(%esp) = input, 8(%esp) = output, 12(%esp) = length
+#                in bytes, 16(%esp) = key schedule, 20(%esp) = 16-byte
+#                IV, 24(%esp) = direction (0 = decrypt, else encrypt).
+# Returns without touching output or IV when length < 16. Only whole
+# 16-byte blocks are processed; a trailing partial block is ignored.
+# On completion the IV buffer receives the last ciphertext block.
+# Aligned frame layout: (%esp) = output - input, 4(%esp) = key schedule,
+# 8(%esp) = IV pointer, 16(%esp)/32(%esp) = decrypt scratch,
+# 48(%esp) = original %esp.
+_vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi # arg 1: input
+ movl 24(%esp),%edi # arg 2: output
+ movl 28(%esp),%eax # arg 3: length
+ movl 32(%esp),%edx # arg 4: key schedule
+ subl $16,%eax
+ jc .L020cbc_abort # fewer than 16 bytes: nothing to do
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp # arg 5: IV pointer
+ andl $-16,%ebx
+ movl 40(%esp),%ecx # arg 6: direction flag
+ xchgl %esp,%ebx # switch to a 16-byte aligned frame; %ebx = original %esp
+ movdqu (%ebp),%xmm1 # %xmm1 carries the chaining value
+ subl %esi,%edi # keep output as an offset from input
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi # %edi = length - 16, counts down by 16 per block
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp # offset; preheat adds its return address
+ call __vpaes_preheat
+.L021pic_point:
+ cmpl $0,%ecx
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0 # plaintext ^ previous ciphertext (or IV)
+ call __vpaes_encrypt_core
+ movl (%esp),%ebx # the core clobbers %ebx and %edx: reload both
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1) # store at input + (output - input)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L023cbc_enc_loop # continue while at least 16 bytes remain
+ jmp .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp) # previous ciphertext (or IV); the core clobbers %xmm1
+ movdqa %xmm0,32(%esp) # current ciphertext becomes the next chaining value
+ call __vpaes_decrypt_core
+ movl (%esp),%ebx # the core clobbers %ebx and %edx: reload both
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1) # store at input + (output - input)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L022cbc_dec_loop # continue while at least 16 bytes remain
+.L024cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp # back to the original stack
+ movdqu %xmm1,(%ebx) # write the final chaining value back to the IV buffer
+.L020cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+
+.section .note.GNU-stack,"",%progbits
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
new file mode 100644
index 0000000000..f8dbd266be
--- /dev/null
+++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s
@@ -0,0 +1,1137 @@
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+#
+# *** This file is auto-generated ***
+#
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.def _vpaes_encrypt_core; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_encrypt_core (x86_64): transform the 16-byte block in %xmm0.
+# In:  %xmm0 = input block
+#      %rdx  = key schedule; 16-byte entries are read sequentially and
+#              the dword at 240(%rdx) is used as loop counter
+#      %xmm9-%xmm15 = constants loaded by _vpaes_preheat
+# Out: %xmm0 = output block
+# Modifies: %rax, %r9, %r10, %r11, %xmm1-%xmm5, flags. %rdx is preserved.
+# ".byte 102,15,56,0,modrm" is a hand-encoded pshufb; ".byte 0xf3,0xc3"
+# is "rep ret".
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax # loop counter (zero-extended into %rax)
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1 # xmm1 = input bits outside the xmm9 mask
+ movdqu (%r9),%xmm5 # first key schedule entry (unaligned load)
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0 # xmm0 = input bits inside the xmm9 mask
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ pxor %xmm5,%xmm2
+ addq $16,%r9 # last flag-setting instruction before the first jnz below
+ pxor %xmm2,%xmm0
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.p2align 4
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm5,%xmm4 # mix in the key schedule entry loaded at the end of .Lenc_entry
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1 # .Lk_mc_forward entry (64 bytes before .Lk_mc_backward)
+.byte 102,15,56,0,234 # pshufb %xmm2,%xmm5
+ movdqa (%r11,%r10,1),%xmm4 # .Lk_mc_backward entry
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ addq $16,%r9 # advance to the next key schedule entry
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ andq $48,%r11 # keep the table index in {0,16,32,48}
+ subq $1,%rax # sets ZF for the jnz below; the SSE code in between leaves EFLAGS alone
+ pxor %xmm3,%xmm0
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232 # pshufb %xmm0,%xmm5
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ movdqu (%r9),%xmm5 # next key schedule entry
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop # ZF comes from subq above (or from addq $16,%r9 on first entry)
+
+
+ movdqa -96(%r10),%xmm4 # .Lk_sbo (96 bytes before .Lk_mc_backward)
+ movdqa -80(%r10),%xmm0 # .Lk_sbo+16
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+.def _vpaes_decrypt_core; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_decrypt_core (x86_64): inverse-direction counterpart of
+# _vpaes_encrypt_core, with the same register contract.
+# In:  %xmm0 = input block
+#      %rdx  = key schedule; 16-byte entries are read sequentially and
+#              the dword at 240(%rdx) is used as loop counter
+#      %xmm9-%xmm11 = constants loaded by _vpaes_preheat
+# Out: %xmm0 = output block
+# Modifies: %rax, %r9, %r10, %r11, %xmm1-%xmm5, flags. %rdx is preserved.
+# ".byte 102,15,56,0,modrm" is a hand-encoded pshufb,
+# ".byte 102,15,58,15,modrm,imm" a hand-encoded palignr and
+# ".byte 0xf3,0xc3" is "rep ret".
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax # loop counter (zero-extended into %rax)
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5 # first key schedule entry (unaligned load)
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ andq $48,%r11 # %r11 = ((counter << 4) ^ 48) & 48
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11 # %r11 = .Lk_dsbd + index; read at -352(%r11) by the final shuffle
+ jmp .Ldec_entry
+
+.p2align 4
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa 0(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
+
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+.byte 102,15,56,0,197 # pshufb %xmm5,%xmm0
+.byte 102,15,56,0,203 # pshufb %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ addq $16,%r9 # advance to the next key schedule entry
+.byte 102,15,58,15,237,12 # palignr $12,%xmm5,%xmm5 (byte-rotate xmm5)
+ pxor %xmm1,%xmm0
+ subq $1,%rax # sets ZF for the jnz below; the SSE code in between leaves EFLAGS alone
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ movdqu (%r9),%xmm0 # next key schedule entry
+ pxor %xmm1,%xmm3
+ jnz .Ldec_loop # ZF comes from subq above (or from addq %r10,%r11 on first entry)
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2 # shuffle mask selected before the loop
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194 # pshufb %xmm2,%xmm0
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+.def _vpaes_schedule_core; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_schedule_core (x86_64): expand a user key into a key schedule.
+# In:  %rdi = user key (16 bytes read at 0, and at 8 or 16 for the
+#             longer key sizes)
+#      %esi = key length in bits (compared against 192 to pick a path)
+#      %rdx = where the first schedule entry is written
+#      %rcx = 0 for the encrypting direction, non-zero for decrypting
+#      %r8  = byte index (multiple of 16) into .Lk_sr
+# Calls _vpaes_preheat itself, so %xmm9-%xmm15 need not be set up.
+# Out: schedule entries stored through %rdx; %xmm0-%xmm7 zeroed.
+# Modifies: %rsi, %rdx, %r8, %r10, %r11, %xmm0-%xmm15, flags.
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8 # kept in %xmm8 and rotated by _vpaes_schedule_round
+ movdqu (%rdi),%xmm0 # first 16 bytes of the user key
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11 # table pair for _vpaes_schedule_transform
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx) # encrypting: store the transformed key
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ movdqu %xmm3,(%rdx) # decrypting: store the permuted raw key
+ xorq $48,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi # %rsi is reused as the round counter from here on
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # key bytes 8..23
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6 # clear the low 64 bits of %xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8 # palignr $8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # key bytes 16..31
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0 # broadcast the top dword of %xmm0
+ movdqa %xmm7,%xmm5 # park %xmm7; the low round does not use %xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11 # output transform for the decrypting direction
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ leaq .Lk_opt(%rip),%r11 # output transform for the encrypting direction
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx) # store the last schedule entry
+
+
+ pxor %xmm0,%xmm0 # clear xmm0-xmm7, which held key material
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_schedule_192_smear (x86_64): helper for the 192-bit path.
+# In:  %xmm6, %xmm7
+# Out: %xmm0 = %xmm6 ^ shuffle(%xmm6) ^ shuffle(%xmm7) (shuffles below)
+#      %xmm6 = the same value with its low 64 bits cleared
+#      %xmm1 = 0
+# %xmm7 and the integer registers are not modified.
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1 # dwords, low to high: xmm6[0],xmm6[0],xmm6[0],xmm6[2]
+ pshufd $254,%xmm7,%xmm0 # dwords, low to high: xmm7[2],xmm7[3],xmm7[3],xmm7[3]
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6 # low 64 bits of xmm6 := high 64 bits of xmm1, which is zero
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.def _vpaes_schedule_round; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_schedule_round (x86_64): one key schedule round, with a second
+# entry point _vpaes_schedule_low_round that skips the first part.
+# In:  %xmm0, %xmm7 = working schedule state
+#      %xmm8 = 16-byte value loaded from .Lk_rcon by _vpaes_schedule_core;
+#              byte-rotated here on every full round
+#      %xmm9-%xmm13 = constants loaded by _vpaes_preheat
+# Out: %xmm0 = %xmm7 = result of the round
+# Modifies: %xmm1-%xmm4 (and %xmm8 on the full entry); no integer
+# registers or flags.
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15 # palignr $15,%xmm8,%xmm1: byte 0 = top byte of xmm8, rest zero
+.byte 102,69,15,58,15,192,15 # palignr $15,%xmm8,%xmm8: rotate xmm8 up by one byte
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0 # broadcast the top dword of xmm0
+.byte 102,15,58,15,192,1 # palignr $1,%xmm0,%xmm0: rotate xmm0 down by one byte
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7 # xmm7 ^= xmm7 shifted up 4 bytes
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7 # xmm7 ^= xmm7 shifted up 8 bytes
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224 # pshufb %xmm0,%xmm4
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211 # pshufb %xmm3,%xmm2
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220 # pshufb %xmm4,%xmm3
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226 # pshufb %xmm2,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195 # pshufb %xmm3,%xmm0
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0 # fold in the smeared previous state
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+
+
+
+
+.def _vpaes_schedule_transform; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_schedule_transform (x86_64): byte-wise table transform of
+# %xmm0 using two 16-byte tables.
+# In:  %xmm0 = value to transform
+#      %r11  = address of the table pair (16-byte aligned; read at 0 and 16)
+#      %xmm9 = mask loaded by _vpaes_preheat (.Lk_s0F, all bytes 0x0F)
+# Out: %xmm0 = pshufb(table at 0, low nibbles) ^
+#              pshufb(table at 16, high nibbles)
+# Modifies: %xmm1, %xmm2.
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1 # xmm1 = high nibble of every input byte
+ pand %xmm9,%xmm0 # xmm0 = low nibble of every input byte
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208 # pshufb %xmm0,%xmm2
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193 # pshufb %xmm1,%xmm0
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.def _vpaes_schedule_mangle; .scl 3; .type 32; .endef
+.p2align 4
+# _vpaes_schedule_mangle (x86_64): convert the current schedule state
+# into its stored form and write it to the key schedule.
+# In:  %xmm0 = state (left unchanged, a copy is worked on)
+#      %rdx  = address of the previously written entry
+#      %rcx  = 0: %rdx advances by 16; non-zero: %rdx goes back by 16
+#      %r8   = byte index into the table at (%r10)
+#      %r10  = table base (_vpaes_schedule_core points it at .Lk_sr)
+#      %xmm9 = mask loaded by _vpaes_preheat
+# Out: 16 bytes stored at the updated (%rdx); %r8 = (%r8 - 16) & 48
+# Modifies: %xmm1-%xmm5, flags, and %r11 on the %rcx != 0 path.
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229 # pshufb %xmm5,%xmm4
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.p2align 4
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221 # pshufb %xmm5,%xmm3
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212 # pshufb %xmm4,%xmm2
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217 # pshufb %xmm1,%xmm3
+ addq $-16,%r8
+ andq $48,%r8 # step the index down by one entry, wrapping within 0..48
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3 # rep ret
+
+
+
+
+
+.globl vpaes_set_encrypt_key
+.def vpaes_set_encrypt_key; .scl 2; .type 32; .endef
+.p2align 4
+# vpaes_set_encrypt_key (x86_64, Microsoft x64 register arguments).
+# Args: %rcx = user key, %rdx = key length in bits, %r8 = key schedule.
+# They are moved to %rdi/%rsi/%rdx, where _vpaes_schedule_core reads
+# them. Stores (bits >> 5) + 5 at offset 240 of the schedule and runs
+# _vpaes_schedule_core with %ecx = 0 and %r8d = 48.
+# Returns 0 in %eax. %rdi and %rsi are stashed in the caller-provided
+# slots at 8(%rsp)/16(%rsp) and reloaded on exit; %xmm6-%xmm15 are
+# spilled to a 184-byte frame and restored.
+vpaes_set_encrypt_key:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax # NOTE(review): not read in the visible code; presumably for an unwind handler
+.LSEH_begin_vpaes_set_encrypt_key:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+ leaq -184(%rsp),%rsp # lea leaves flags alone; movaps below needs 16-byte aligned slots
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
+.Lenc_key_body:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx) # value later read by the encrypt/decrypt cores as loop counter
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq 184(%rsp),%rsp
+.Lenc_key_epilogue:
+ xorl %eax,%eax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3 # rep ret
+.LSEH_end_vpaes_set_encrypt_key:
+
+.globl vpaes_set_decrypt_key
+.def vpaes_set_decrypt_key; .scl 2; .type 32; .endef
+.p2align 4
+# vpaes_set_decrypt_key (x86_64, Microsoft x64 register arguments).
+# Args: %rcx = user key, %rdx = key length in bits, %r8 = key schedule.
+# Stores r = (bits >> 5) + 5 at offset 240 of the schedule and runs
+# _vpaes_schedule_core with %ecx = 1, %rdx = schedule + 16 + 16*r and
+# %r8d = ((bits >> 1) & 32) ^ 32, so entries are written going downwards.
+# Returns 0 in %eax. %rdi and %rsi are stashed in the caller-provided
+# slots at 8(%rsp)/16(%rsp) and reloaded on exit; %xmm6-%xmm15 are
+# spilled to a 184-byte frame and restored.
+vpaes_set_decrypt_key:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax # NOTE(review): not read in the visible code; presumably for an unwind handler
+.LSEH_begin_vpaes_set_decrypt_key:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+ leaq -184(%rsp),%rsp # lea leaves flags alone; movaps below needs 16-byte aligned slots
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
+.Ldec_key_body:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx # start from the far end of the schedule
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq 184(%rsp),%rsp
+.Ldec_key_epilogue:
+ xorl %eax,%eax
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3 # rep ret
+.LSEH_end_vpaes_set_decrypt_key:
+
+.globl vpaes_encrypt
+.def vpaes_encrypt; .scl 2; .type 32; .endef
+.p2align 4
+# vpaes_encrypt (x86_64, Microsoft x64 register arguments): process one
+# 16-byte block with _vpaes_encrypt_core.
+# Args: %rcx = input block, %rdx = output block, %r8 = key schedule.
+# Input and output are accessed with unaligned loads/stores.
+# %rdi and %rsi are stashed in the caller-provided slots at
+# 8(%rsp)/16(%rsp) and reloaded on exit; %xmm6-%xmm15 are spilled to a
+# 184-byte frame and restored. %rax is not set to a return value.
+vpaes_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax # NOTE(review): not read in the visible code; presumably for an unwind handler
+.LSEH_begin_vpaes_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+ leaq -184(%rsp),%rsp
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
+.Lenc_body:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat # loads the constants into %xmm9-%xmm15
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq 184(%rsp),%rsp
+.Lenc_epilogue:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3 # rep ret
+.LSEH_end_vpaes_encrypt:
+
+.globl vpaes_decrypt
+.def vpaes_decrypt; .scl 2; .type 32; .endef
+.p2align 4
+# vpaes_decrypt (x86_64, Microsoft x64 register arguments): process one
+# 16-byte block with _vpaes_decrypt_core.
+# Args: %rcx = input block, %rdx = output block, %r8 = key schedule.
+# Input and output are accessed with unaligned loads/stores.
+# %rdi and %rsi are stashed in the caller-provided slots at
+# 8(%rsp)/16(%rsp) and reloaded on exit; %xmm6-%xmm15 are spilled to a
+# 184-byte frame and restored. %rax is not set to a return value.
+vpaes_decrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax # NOTE(review): not read in the visible code; presumably for an unwind handler
+.LSEH_begin_vpaes_decrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+
+ leaq -184(%rsp),%rsp
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
+.Ldec_body:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat # loads the constants into %xmm9-%xmm15
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq 184(%rsp),%rsp
+.Ldec_epilogue:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3 # rep ret
+.LSEH_end_vpaes_decrypt:
+.globl vpaes_cbc_encrypt
+.def vpaes_cbc_encrypt; .scl 2; .type 32; .endef
+.p2align 4
+# vpaes_cbc_encrypt (x86_64, Microsoft x64 arguments): CBC chaining
+# around the single-block cores.
+# Args: %rcx = input, %rdx = output, %r8 = length in bytes,
+#       %r9 = key schedule, 40(%rsp) = 16-byte IV,
+#       48(%rsp) = direction (low 32 bits: 0 = decrypt, else encrypt).
+# Returns without touching output or IV when length < 16. Only whole
+# 16-byte blocks are processed; a trailing partial block is ignored.
+# On completion the IV buffer receives the last ciphertext block.
+# Working registers in the loops: %rdi = input cursor, %rsi = output
+# minus input, %rcx = bytes left minus 16, %rdx = key schedule,
+# %r8 = IV pointer, %xmm6 = chaining value.
+vpaes_cbc_encrypt:
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rsp,%rax # NOTE(review): not read in the visible code; presumably for an unwind handler
+.LSEH_begin_vpaes_cbc_encrypt:
+ movq %rcx,%rdi
+ movq %rdx,%rsi
+ movq %r8,%rdx
+ movq %r9,%rcx
+ movq 40(%rsp),%r8 # fifth argument: IV pointer
+ movq 48(%rsp),%r9 # sixth argument: direction flag
+
+ xchgq %rcx,%rdx # %rcx = length, %rdx = key schedule
+ subq $16,%rcx
+ jc .Lcbc_abort # fewer than 16 bytes: nothing to do (frame not yet allocated)
+ leaq -184(%rsp),%rsp
+ movaps %xmm6,16(%rsp)
+ movaps %xmm7,32(%rsp)
+ movaps %xmm8,48(%rsp)
+ movaps %xmm9,64(%rsp)
+ movaps %xmm10,80(%rsp)
+ movaps %xmm11,96(%rsp)
+ movaps %xmm12,112(%rsp)
+ movaps %xmm13,128(%rsp)
+ movaps %xmm14,144(%rsp)
+ movaps %xmm15,160(%rsp)
+.Lcbc_body:
+ movdqu (%r8),%xmm6 # %xmm6 carries the chaining value
+ subq %rdi,%rsi # keep output as an offset from input
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.p2align 4
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0 # plaintext ^ previous ciphertext (or IV)
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1) # store at input + (output - input)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop # continue while at least 16 bytes remain
+ jmp .Lcbc_done
+.p2align 4
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7 # keep the ciphertext; the core does not touch %xmm6/%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6 # current ciphertext becomes the next chaining value
+ movdqu %xmm0,(%rsi,%rdi,1) # store at input + (output - input)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop # continue while at least 16 bytes remain
+.Lcbc_done:
+ movdqu %xmm6,(%r8) # write the final chaining value back to the IV buffer
+ movaps 16(%rsp),%xmm6
+ movaps 32(%rsp),%xmm7
+ movaps 48(%rsp),%xmm8
+ movaps 64(%rsp),%xmm9
+ movaps 80(%rsp),%xmm10
+ movaps 96(%rsp),%xmm11
+ movaps 112(%rsp),%xmm12
+ movaps 128(%rsp),%xmm13
+ movaps 144(%rsp),%xmm14
+ movaps 160(%rsp),%xmm15
+ leaq 184(%rsp),%rsp
+.Lcbc_epilogue:
+.Lcbc_abort:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3 # rep ret
+.LSEH_end_vpaes_cbc_encrypt:
+
+# _vpaes_preheat: load lookup tables into xmm9-xmm15; the entry points in this file call it right before a _vpaes_*_core routine.
+# Takes no arguments. Writes only r10 and xmm9-xmm15.
+# Out: r10 = address of .Lk_s0F; xmm9 = .Lk_s0F; xmm10/xmm11 = first/second 16 bytes of .Lk_inv;
+#      xmm13/xmm12 = first/second 16 bytes of .Lk_sb1; xmm15/xmm14 = first/second 16 bytes of .Lk_sb2.
+# The fixed displacements below depend on the exact layout of the table under _vpaes_consts.
+.def _vpaes_preheat; .scl 3; .type 32; .endef
+.p2align 4
+_vpaes_preheat:
+ leaq .Lk_s0F(%rip),%r10  # RIP-relative base; every load below is an offset from .Lk_s0F
+ movdqa -32(%r10),%xmm10  # .Lk_inv, first row
+ movdqa -16(%r10),%xmm11  # .Lk_inv, second row
+ movdqa 0(%r10),%xmm9  # .Lk_s0F
+ movdqa 48(%r10),%xmm13  # .Lk_sb1, first row
+ movdqa 64(%r10),%xmm12  # .Lk_sb1, second row
+ movdqa 80(%r10),%xmm15  # .Lk_sb2, first row
+ movdqa 96(%r10),%xmm14  # .Lk_sb2, second row
+ .byte 0xf3,0xc3  # hand-encoded "rep ret"
+
+
+
+
+# _vpaes_consts: constant tables, 64-byte aligned so that every 16-byte row can be loaded with movdqa.
+# The layout is load-bearing: _vpaes_preheat reaches .Lk_inv, .Lk_sb1 and .Lk_sb2 through fixed
+# displacements from .Lk_s0F, so the entries from .Lk_inv through .Lk_sb2 must not be moved or resized.
+.p2align 6
+_vpaes_consts:
+.Lk_inv:  # at .Lk_s0F-32; _vpaes_preheat loads its two rows into xmm10 and xmm11
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:  # 0x0F in every byte; _vpaes_preheat loads it into xmm9 and leaves r10 pointing here
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:  # occupies .Lk_s0F+16 .. .Lk_s0F+47
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:  # at .Lk_s0F+48; _vpaes_preheat loads its two rows into xmm13 and xmm12
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:  # at .Lk_s0F+80; _vpaes_preheat loads its two rows into xmm15 and xmm14
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:  # four 16-byte rows
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:  # four 16-byte rows
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:  # four 16-byte rows; the first row is the identity byte order 0x00..0x0F
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:  # 0x5B in every byte
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+# NOTE(review): the .Lk_dks* names suggest tables for the decryption key schedule;
+# the code that reads them is outside this chunk - confirm before relying on that.
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+# NOTE(review): the .Lk_dipt / .Lk_dsb* names suggest tables for the decryption round function;
+# the code that reads them is outside this chunk - confirm before relying on that.
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0  # NUL-terminated ASCII tag: "Vector Permutaion [sic] AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.p2align 6
+# se_handler: Win64 language-specific exception handler named by every .LSEH_info_* record in .xdata. In: r8 = CONTEXT *, r9 = DISPATCHER_CONTEXT *.
+# It rewrites Rsp/Rsi/Rdi (and Xmm6-Xmm15 when the frame is live) in the CONTEXT to their values at function entry, calls RtlVirtualUnwind, and returns 1.
+.def se_handler; .scl 3; .type 32; .endef
+.p2align 4
+se_handler:
+ pushq %rsi  # save the callee-saved GPRs and flags; rsi/rdi and the direction flag are used by the copies below
+ pushq %rdi
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+ subq $64,%rsp  # 32-byte home area plus four stack arguments for RtlVirtualUnwind
+
+ movq 120(%r8),%rax  # context->Rax: the entry points copy their entry rsp into rax first thing
+ movq 248(%r8),%rbx  # context->Rip
+
+ movq 8(%r9),%rsi  # disp->ImageBase
+ movq 56(%r9),%r11  # disp->HandlerData: the {body, epilogue} RVA pair from .xdata
+
+ movl 0(%r11),%r10d  # HandlerData[0] = RVA of the *_body label
+ leaq (%rsi,%r10,1),%r10  # RVA -> absolute address
+ cmpq %r10,%rbx
+ jb .Lin_prologue  # Rip < body: rax still holds the entry rsp
+
+ movq 152(%r8),%rax  # context->Rsp
+
+ movl 4(%r11),%r10d  # HandlerData[1] = RVA of the *_epilogue label
+ leaq (%rsi,%r10,1),%r10  # RVA -> absolute address
+ cmpq %r10,%rbx
+ jae .Lin_prologue  # Rip >= epilogue: the frame is already released, Rsp is the entry rsp
+
+ leaq 16(%rax),%rsi  # inside the body: saved xmm6-xmm15 start at 16(%rsp)
+ leaq 512(%r8),%rdi  # &context->Xmm6
+ movl $20,%ecx  # 10 registers * 16 bytes = 20 qwords
+.long 0xa548f3fc  # bytes fc f3 48 a5 = cld; rep movsq
+ leaq 184(%rax),%rax  # step over the 184-byte frame to get the entry rsp
+
+.Lin_prologue:
+ movq 8(%rax),%rdi  # caller's rdi/rsi as parked in the home slots by the entry points
+ movq 16(%rax),%rsi
+ movq %rax,152(%r8)  # context->Rsp = entry rsp
+ movq %rsi,168(%r8)  # context->Rsi
+ movq %rdi,176(%r8)  # context->Rdi
+
+ movq 40(%r9),%rdi  # disp->ContextRecord (copy destination)
+ movq %r8,%rsi  # patched context (copy source)
+ movl $154,%ecx  # sizeof(CONTEXT) in qwords (1232 bytes)
+.long 0xa548f3fc  # bytes fc f3 48 a5 = cld; rep movsq
+
+ movq %r9,%rsi  # rsi = disp, since r8/r9 are about to carry arguments
+ xorq %rcx,%rcx  # arg1 HandlerType = 0 (UNW_FLAG_NHANDLER)
+ movq 8(%rsi),%rdx  # arg2 disp->ImageBase
+ movq 0(%rsi),%r8  # arg3 disp->ControlPc
+ movq 16(%rsi),%r9  # arg4 disp->FunctionEntry
+ movq 40(%rsi),%r10  # disp->ContextRecord
+ leaq 56(%rsi),%r11  # &disp->HandlerData
+ leaq 24(%rsi),%r12  # &disp->EstablisherFrame
+ movq %r10,32(%rsp)  # arg5 ContextRecord
+ movq %r11,40(%rsp)  # arg6 &HandlerData
+ movq %r12,48(%rsp)  # arg7 &EstablisherFrame
+ movq %rcx,56(%rsp)  # arg8 ContextPointers = NULL
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ movl $1,%eax  # return ExceptionContinueSearch
+ addq $64,%rsp
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %rdi
+ popq %rsi
+ .byte 0xf3,0xc3  # hand-encoded "rep ret"
+# .pdata: one RUNTIME_FUNCTION record {begin RVA, end RVA, unwind-info RVA} per exported vpaes_* routine.
+# .xdata: the matching UNWIND_INFO records; each names se_handler and supplies the {body, epilogue} label pair it reads.
+.section .pdata
+.p2align 2
+.rva .LSEH_begin_vpaes_set_encrypt_key
+.rva .LSEH_end_vpaes_set_encrypt_key
+.rva .LSEH_info_vpaes_set_encrypt_key
+
+.rva .LSEH_begin_vpaes_set_decrypt_key
+.rva .LSEH_end_vpaes_set_decrypt_key
+.rva .LSEH_info_vpaes_set_decrypt_key
+
+.rva .LSEH_begin_vpaes_encrypt
+.rva .LSEH_end_vpaes_encrypt
+.rva .LSEH_info_vpaes_encrypt
+
+.rva .LSEH_begin_vpaes_decrypt
+.rva .LSEH_end_vpaes_decrypt
+.rva .LSEH_info_vpaes_decrypt
+
+.rva .LSEH_begin_vpaes_cbc_encrypt
+.rva .LSEH_end_vpaes_cbc_encrypt
+.rva .LSEH_info_vpaes_cbc_encrypt
+
+.section .xdata
+.p2align 3
+.LSEH_info_vpaes_set_encrypt_key:
+.byte 9,0,0,0  # version 1 with UNW_FLAG_EHANDLER; prolog size 0, no unwind codes, no frame register
+.rva se_handler  # language-specific handler
+.rva .Lenc_key_body,.Lenc_key_epilogue  # HandlerData[0], HandlerData[1] as read by se_handler
+.LSEH_info_vpaes_set_decrypt_key:
+.byte 9,0,0,0  # version 1 with UNW_FLAG_EHANDLER; prolog size 0, no unwind codes, no frame register
+.rva se_handler  # language-specific handler
+.rva .Ldec_key_body,.Ldec_key_epilogue  # HandlerData[0], HandlerData[1] as read by se_handler
+.LSEH_info_vpaes_encrypt:
+.byte 9,0,0,0  # version 1 with UNW_FLAG_EHANDLER; prolog size 0, no unwind codes, no frame register
+.rva se_handler  # language-specific handler
+.rva .Lenc_body,.Lenc_epilogue  # HandlerData[0], HandlerData[1] as read by se_handler
+.LSEH_info_vpaes_decrypt:
+.byte 9,0,0,0  # version 1 with UNW_FLAG_EHANDLER; prolog size 0, no unwind codes, no frame register
+.rva se_handler  # language-specific handler
+.rva .Ldec_body,.Ldec_epilogue  # HandlerData[0], HandlerData[1] as read by se_handler
+.LSEH_info_vpaes_cbc_encrypt:
+.byte 9,0,0,0  # version 1 with UNW_FLAG_EHANDLER; prolog size 0, no unwind codes, no frame register
+.rva se_handler  # language-specific handler
+.rva .Lcbc_body,.Lcbc_epilogue  # HandlerData[0], HandlerData[1] as read by se_handler
+
+.section .note.GNU-stack,"",%progbits
diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s
index 1970712d30..9c982a20b5 100644
--- a/lib/accelerated/x86/coff/aesni-x86.s
+++ b/lib/accelerated/x86/coff/aesni-x86.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s
index 85b51085a5..30b822394b 100644
--- a/lib/accelerated/x86/coff/aesni-x86_64.s
+++ b/lib/accelerated/x86/coff/aesni-x86_64.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/cpuid-x86.s b/lib/accelerated/x86/coff/cpuid-x86.s
index f35cfba63a..9931ff05ba 100644
--- a/lib/accelerated/x86/coff/cpuid-x86.s
+++ b/lib/accelerated/x86/coff/cpuid-x86.s
@@ -1,5 +1,6 @@
#
-# Copyright (C) 2011-2012 Free Software Foundation, Inc.
+# Copyright (C) 2011-2013 Free Software Foundation, Inc.
+# Copyright (C) 2013 Nikos Mavrogiannopoulos
#
# Author: Nikos Mavrogiannopoulos
#
diff --git a/lib/accelerated/x86/coff/cpuid-x86_64.s b/lib/accelerated/x86/coff/cpuid-x86_64.s
index 033df92ebf..3add1900cf 100644
--- a/lib/accelerated/x86/coff/cpuid-x86_64.s
+++ b/lib/accelerated/x86/coff/cpuid-x86_64.s
@@ -1,5 +1,6 @@
#
-# Copyright (C) 2011-2012 Free Software Foundation, Inc.
+# Copyright (C) 2011-2013 Free Software Foundation, Inc.
+# Copyright (C) 2013 Nikos Mavrogiannopoulos
#
# Author: Nikos Mavrogiannopoulos
#
diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/coff/e_padlock-x86.s
index d51d62ff73..328e6462f6 100644
--- a/lib/accelerated/x86/coff/e_padlock-x86.s
+++ b/lib/accelerated/x86/coff/e_padlock-x86.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/e_padlock-x86_64.s b/lib/accelerated/x86/coff/e_padlock-x86_64.s
index 14c62fd176..6b73825042 100644
--- a/lib/accelerated/x86/coff/e_padlock-x86_64.s
+++ b/lib/accelerated/x86/coff/e_padlock-x86_64.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s
index 951ee891b9..d61c82b9cc 100644
--- a/lib/accelerated/x86/coff/ghash-x86_64.s
+++ b/lib/accelerated/x86/coff/ghash-x86_64.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86.s b/lib/accelerated/x86/coff/sha1-ssse3-x86.s
index 9bd41a0de4..450f574e16 100644
--- a/lib/accelerated/x86/coff/sha1-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/sha1-ssse3-x86.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
index 75868a42c6..98fd50dcc1 100644
--- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s b/lib/accelerated/x86/coff/sha256-ssse3-x86.s
index 6fe27746ce..117b2bd413 100644
--- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s
index 79098da5c2..d68eeffb4a 100644
--- a/lib/accelerated/x86/coff/sha512-ssse3-x86.s
+++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
index bbb2661f26..dd80574c89 100644
--- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
+++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s
@@ -1,4 +1,4 @@
-# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org>
+# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without