diff options
author | Nikos Mavrogiannopoulos <nmav@gnutls.org> | 2013-12-14 13:00:55 +0100 |
---|---|---|
committer | Nikos Mavrogiannopoulos <nmav@gnutls.org> | 2013-12-14 13:09:07 +0100 |
commit | 226ae36af51105cd21a5d2bdcc21e9f4062f14bd (patch) | |
tree | 99a9a7005aacde34f7a3cb91965883f634765e6d /lib/accelerated/x86/coff | |
parent | 48097fa622ff63f9839cc11f2c88ef7af495e9a7 (diff) | |
download | gnutls-226ae36af51105cd21a5d2bdcc21e9f4062f14bd.tar.gz |
Added Mike Hamburg's SSSE3 AES implementation.
Diffstat (limited to 'lib/accelerated/x86/coff')
-rw-r--r-- | lib/accelerated/x86/coff/aes-ssse3-x86.s | 662 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/aes-ssse3-x86_64.s | 1137 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/aesni-x86.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/aesni-x86_64.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/cpuid-x86.s | 3 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/cpuid-x86_64.s | 3 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/e_padlock-x86.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/e_padlock-x86_64.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/ghash-x86_64.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha1-ssse3-x86.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha1-ssse3-x86_64.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha256-ssse3-x86.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha512-ssse3-x86.s | 2 | ||||
-rw-r--r-- | lib/accelerated/x86/coff/sha512-ssse3-x86_64.s | 2 |
14 files changed, 1813 insertions, 12 deletions
diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86.s b/lib/accelerated/x86/coff/aes-ssse3-x86.s new file mode 100644 index 0000000000..6894b14b7c --- /dev/null +++ b/lib/accelerated/x86/coff/aes-ssse3-x86.s @@ -0,0 +1,662 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.file "vpaes-x86.s" +.text +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.def __vpaes_preheat; .scl 3; .type 32; .endef +.align 16 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.def __vpaes_encrypt_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 +.byte 102,15,56,0,208 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx +.byte 102,15,56,0,193 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 +.byte 102,15,56,0,234 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addl $16,%edx + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addl $16,%ecx + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + ret +.def __vpaes_decrypt_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 +.byte 102,15,56,0,208 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx +.byte 102,15,56,0,193 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp .L002dec_entry +.align 16 +.L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addl $16,%edx +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subl $1,%eax +.L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 +.byte 102,15,56,0,208 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz .L003dec_loop + movdqa 96(%ebx),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + ret +.def __vpaes_schedule_core; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L005schedule_go +.L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L005schedule_go: + cmpl $192,%eax + ja .L006schedule_256 + je .L007schedule_192 +.L008schedule_128: + movl $10,%eax +.L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + jmp .L009loop_schedule_128 +.align 16 +.L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +.L011loop_schedule_192: + call __vpaes_schedule_round +.byte 102,15,58,15,198,8 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp .L011loop_schedule_192 +.align 16 +.L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +.L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L012loop_schedule_256 +.align 16 +.L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,193 + leal 352(%ebp),%ebx + addl $32,%edx +.L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.def __vpaes_schedule_192_smear; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.def __vpaes_schedule_round; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 +.byte 102,15,58,15,202,15 +.byte 102,15,58,15,210,15 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 +.byte 102,15,56,0,226 + movdqa 48(%ebp),%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.def __vpaes_schedule_transform; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%ebx),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + ret +.def __vpaes_schedule_mangle; .scl 3; .type 32; .endef +.align 16 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + jmp .L015schedule_mangle_both +.align 16 +.L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 32(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 64(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + movdqa 96(%esi),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 +.byte 102,15,56,0,217 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.def _vpaes_set_encrypt_key; .scl 2; .type 32; .endef +.align 16 +_vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L016pic_point,%ebp + call __vpaes_schedule_core +.L016pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.def _vpaes_set_decrypt_key; .scl 2; .type 32; .endef +.align 16 +_vpaes_set_decrypt_key: +.L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + call __vpaes_schedule_core +.L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.def _vpaes_encrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_encrypt: +.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp + call __vpaes_preheat +.L018pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_decrypt +.def _vpaes_decrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_decrypt: +.L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L019pic_point,%ebp + call __vpaes_preheat +.L019pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.def _vpaes_cbc_encrypt; .scl 2; .type 32; .endef +.align 16 +_vpaes_cbc_encrypt: +.L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc .L020cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + call __vpaes_preheat +.L021pic_point: + cmpl $0,%ecx + je .L022cbc_dec_loop + jmp .L023cbc_enc_loop +.align 16 +.L023cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L023cbc_enc_loop + jmp .L024cbc_done +.align 16 +.L022cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L022cbc_dec_loop +.L024cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +.L020cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/coff/aes-ssse3-x86_64.s b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s new file mode 100644 index 0000000000..f8dbd266be --- /dev/null +++ b/lib/accelerated/x86/coff/aes-ssse3-x86_64.s @@ -0,0 +1,1137 @@ +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +# +# *** This file is auto-generated *** +# +.text + + + + + + + + + + + + + + + + +.def _vpaes_encrypt_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_encrypt_core: + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_ipt+16(%rip),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.p2align 4 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,226 +.byte 102,15,56,0,195 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 +.byte 102,15,56,0,234 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 +.byte 102,15,56,0,193 + addq $16,%r9 + pxor %xmm2,%xmm0 +.byte 102,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,193 + andq $48,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,232 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 +.byte 102,15,56,0,224 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 +.byte 102,15,56,0,226 + pxor %xmm5,%xmm4 +.byte 102,15,56,0,195 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,193 + .byte 0xf3,0xc3 + + + + + + + +.def _vpaes_decrypt_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_decrypt_core: + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $48,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $48,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.p2align 4 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 + + + + + + + +.def _vpaes_schedule_core; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $48,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.p2align 4 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.p2align 4 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.p2align 4 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz .Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + +.def _vpaes_schedule_192_smear; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + +.def _vpaes_schedule_round; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $255,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 + + + + + + + + + + + +.def _vpaes_schedule_transform; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 + + + + + + + + + + + + + + + + + + + + + + + + + +.def _vpaes_schedule_mangle; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.p2align 4 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq $48,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 + + + + + +.globl vpaes_set_encrypt_key +.def vpaes_set_encrypt_key; .scl 2; .type 32; .endef +.p2align 4 +vpaes_set_encrypt_key: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_set_encrypt_key: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lenc_key_body: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $48,%r8d + call _vpaes_schedule_core + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lenc_key_epilogue: + xorl %eax,%eax + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_set_encrypt_key: + +.globl vpaes_set_decrypt_key +.def vpaes_set_decrypt_key; .scl 2; .type 32; .endef +.p2align 4 +vpaes_set_decrypt_key: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_set_decrypt_key: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Ldec_key_body: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Ldec_key_epilogue: + xorl %eax,%eax + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_set_decrypt_key: + +.globl vpaes_encrypt +.def vpaes_encrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lenc_body: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lenc_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_encrypt: + +.globl vpaes_decrypt +.def vpaes_decrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_decrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_decrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Ldec_body: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Ldec_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_decrypt: +.globl vpaes_cbc_encrypt +.def vpaes_cbc_encrypt; .scl 2; .type 32; .endef +.p2align 4 +vpaes_cbc_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_vpaes_cbc_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + leaq -184(%rsp),%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) +.Lcbc_body: + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.p2align 4 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.p2align 4 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq 184(%rsp),%rsp +.Lcbc_epilogue: +.Lcbc_abort: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_vpaes_cbc_encrypt: + + + + + + +.def _vpaes_preheat; .scl 3; .type 32; .endef +.p2align 4 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 + + + + + + + +.p2align 6 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + + +.def se_handler; .scl 3; .type 32; .endef +.p2align 4 +se_handler: + pushq %rsi + pushq %rdi + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + subq $64,%rsp + + movq 120(%r8),%rax + movq 248(%r8),%rbx + + movq 8(%r9),%rsi + movq 56(%r9),%r11 + + movl 0(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jb .Lin_prologue + + movq 152(%r8),%rax + + movl 4(%r11),%r10d + leaq (%rsi,%r10,1),%r10 + cmpq %r10,%rbx + jae .Lin_prologue + + leaq 16(%rax),%rsi + leaq 512(%r8),%rdi + movl $20,%ecx +.long 0xa548f3fc + leaq 184(%rax),%rax + +.Lin_prologue: + movq 8(%rax),%rdi + movq 16(%rax),%rsi + movq %rax,152(%r8) + movq %rsi,168(%r8) + movq %rdi,176(%r8) + + movq 40(%r9),%rdi + movq %r8,%rsi + movl $154,%ecx +.long 0xa548f3fc + + movq %r9,%rsi + xorq %rcx,%rcx + movq 8(%rsi),%rdx + movq 0(%rsi),%r8 + movq 16(%rsi),%r9 + movq 40(%rsi),%r10 + leaq 56(%rsi),%r11 + leaq 24(%rsi),%r12 + movq %r10,32(%rsp) + movq %r11,40(%rsp) + movq %r12,48(%rsp) + movq %rcx,56(%rsp) + call *__imp_RtlVirtualUnwind(%rip) + + movl $1,%eax + addq $64,%rsp + popfq + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %rdi + popq %rsi + .byte 0xf3,0xc3 + + +.section .pdata +.p2align 2 +.rva .LSEH_begin_vpaes_set_encrypt_key +.rva .LSEH_end_vpaes_set_encrypt_key +.rva .LSEH_info_vpaes_set_encrypt_key + +.rva .LSEH_begin_vpaes_set_decrypt_key +.rva .LSEH_end_vpaes_set_decrypt_key +.rva .LSEH_info_vpaes_set_decrypt_key + +.rva .LSEH_begin_vpaes_encrypt +.rva .LSEH_end_vpaes_encrypt +.rva .LSEH_info_vpaes_encrypt + +.rva .LSEH_begin_vpaes_decrypt +.rva .LSEH_end_vpaes_decrypt +.rva .LSEH_info_vpaes_decrypt + +.rva .LSEH_begin_vpaes_cbc_encrypt +.rva .LSEH_end_vpaes_cbc_encrypt +.rva .LSEH_info_vpaes_cbc_encrypt + +.section .xdata +.p2align 3 +.LSEH_info_vpaes_set_encrypt_key: +.byte 9,0,0,0 +.rva se_handler +.rva .Lenc_key_body,.Lenc_key_epilogue +.LSEH_info_vpaes_set_decrypt_key: +.byte 9,0,0,0 +.rva se_handler +.rva .Ldec_key_body,.Ldec_key_epilogue +.LSEH_info_vpaes_encrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Lenc_body,.Lenc_epilogue +.LSEH_info_vpaes_decrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Ldec_body,.Ldec_epilogue +.LSEH_info_vpaes_cbc_encrypt: +.byte 9,0,0,0 +.rva se_handler +.rva .Lcbc_body,.Lcbc_epilogue + +.section .note.GNU-stack,"",%progbits diff --git a/lib/accelerated/x86/coff/aesni-x86.s b/lib/accelerated/x86/coff/aesni-x86.s index 1970712d30..9c982a20b5 100644 --- a/lib/accelerated/x86/coff/aesni-x86.s +++ b/lib/accelerated/x86/coff/aesni-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/aesni-x86_64.s b/lib/accelerated/x86/coff/aesni-x86_64.s index 85b51085a5..30b822394b 100644 --- a/lib/accelerated/x86/coff/aesni-x86_64.s +++ b/lib/accelerated/x86/coff/aesni-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/cpuid-x86.s b/lib/accelerated/x86/coff/cpuid-x86.s index f35cfba63a..9931ff05ba 100644 --- a/lib/accelerated/x86/coff/cpuid-x86.s +++ b/lib/accelerated/x86/coff/cpuid-x86.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/coff/cpuid-x86_64.s b/lib/accelerated/x86/coff/cpuid-x86_64.s index 033df92ebf..3add1900cf 100644 --- a/lib/accelerated/x86/coff/cpuid-x86_64.s +++ b/lib/accelerated/x86/coff/cpuid-x86_64.s @@ -1,5 +1,6 @@ # -# Copyright (C) 2011-2012 Free Software Foundation, Inc. +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# Copyright (C) 2013 Nikos Mavrogiannopoulos # # Author: Nikos Mavrogiannopoulos # diff --git a/lib/accelerated/x86/coff/e_padlock-x86.s b/lib/accelerated/x86/coff/e_padlock-x86.s index d51d62ff73..328e6462f6 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86.s +++ b/lib/accelerated/x86/coff/e_padlock-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/e_padlock-x86_64.s b/lib/accelerated/x86/coff/e_padlock-x86_64.s index 14c62fd176..6b73825042 100644 --- a/lib/accelerated/x86/coff/e_padlock-x86_64.s +++ b/lib/accelerated/x86/coff/e_padlock-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/ghash-x86_64.s b/lib/accelerated/x86/coff/ghash-x86_64.s index 951ee891b9..d61c82b9cc 100644 --- a/lib/accelerated/x86/coff/ghash-x86_64.s +++ b/lib/accelerated/x86/coff/ghash-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86.s b/lib/accelerated/x86/coff/sha1-ssse3-x86.s index 9bd41a0de4..450f574e16 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s index 75868a42c6..98fd50dcc1 100644 --- a/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha1-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha256-ssse3-x86.s b/lib/accelerated/x86/coff/sha256-ssse3-x86.s index 6fe27746ce..117b2bd413 100644 --- a/lib/accelerated/x86/coff/sha256-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha256-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86.s b/lib/accelerated/x86/coff/sha512-ssse3-x86.s index 79098da5c2..d68eeffb4a 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s index bbb2661f26..dd80574c89 100644 --- a/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s +++ b/lib/accelerated/x86/coff/sha512-ssse3-x86_64.s @@ -1,4 +1,4 @@ -# Copyright (c) 2011-2012, Andy Polyakov <appro@openssl.org> +# Copyright (c) 2011-2013, Andy Polyakov <appro@openssl.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without |