author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2011-08-08 14:54:08 +0200
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2011-08-08 19:49:29 +0200
commit     892498cd2c25e3cc7b20d6723affaf9734bc0b60 (patch)
tree       9f9df753b59c1aef9441276e4e5c3d4dc0190dd4
parent     99c30761cd072643d7b589eb14d6c1cfd00c696f (diff)
download   gnutls-892498cd2c25e3cc7b20d6723affaf9734bc0b60.tar.gz
Included appro's updates to AES-NI.
-rw-r--r--   lib/accelerated/intel/asm/appro-aes-x86-64.s | 368
-rw-r--r--   lib/accelerated/intel/asm/appro-aes-x86.s    | 295
2 files changed, 338 insertions(+), 325 deletions(-)
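The bulk of the update replaces aligned SSE loads and stores of the AES round keys (movaps) with their unaligned counterparts (movups), and also reworks the CCM64 paths so the counter byte-swap and increment are interleaved with the final rounds. The movaps/movups distinction is what makes the change a correctness fix rather than a cosmetic one: movaps raises a general-protection fault unless its memory operand is 16-byte aligned, while movups accepts any address, so the key schedule no longer has to be 16-byte aligned. A minimal standalone sketch of that distinction (hypothetical illustration, not part of this commit; load_round_key is an invented name):

.text
.globl load_round_key
.type load_round_key,@function
load_round_key:
	movups	(%rdi),%xmm0	# unaligned load: safe for any key-schedule address
#	movaps	(%rdi),%xmm0	# aligned load: would #GP-fault if %rdi & 15 != 0
	ret
.size load_round_key,.-load_round_key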
diff --git a/lib/accelerated/intel/asm/appro-aes-x86-64.s b/lib/accelerated/intel/asm/appro-aes-x86-64.s
index 98204d1ba0..f286fb12bd 100644
--- a/lib/accelerated/intel/asm/appro-aes-x86-64.s
+++ b/lib/accelerated/intel/asm/appro-aes-x86-64.s
@@ -42,14 +42,14 @@ aesni_encrypt:
 movups (%rdi),%xmm2
 movl 240(%rdx),%eax
-movaps (%rdx),%xmm0
-movaps 16(%rdx),%xmm1
+movups (%rdx),%xmm0
+movups 16(%rdx),%xmm1
 leaq 32(%rdx),%rdx
 xorps %xmm0,%xmm2
 .Loop_enc1_1:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rdx),%xmm1
+movups (%rdx),%xmm1
 leaq 16(%rdx),%rdx
 jnz .Loop_enc1_1
 .byte 102,15,56,221,209
@@ -63,14 +63,14 @@ aesni_encrypt:
 aesni_decrypt:
 movups (%rdi),%xmm2
 movl 240(%rdx),%eax
-movaps (%rdx),%xmm0
-movaps 16(%rdx),%xmm1
+movups (%rdx),%xmm0
+movups 16(%rdx),%xmm1
 leaq 32(%rdx),%rdx
 xorps %xmm0,%xmm2
 .Loop_dec1_2:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rdx),%xmm1
+movups (%rdx),%xmm1
 leaq 16(%rdx),%rdx
 jnz .Loop_dec1_2
 .byte 102,15,56,223,209
@@ -80,26 +80,26 @@ aesni_decrypt:
 .type _aesni_encrypt3,@function
 .align 16
 _aesni_encrypt3:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
 xorps %xmm0,%xmm4
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .Lenc_loop3:
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
 decl %eax
 .byte 102,15,56,220,225
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leaq 32(%rcx),%rcx
 .byte 102,15,56,220,224
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lenc_loop3
 .byte 102,15,56,220,209
@@ -113,26 +113,26 @@ _aesni_encrypt3:
 .type _aesni_decrypt3,@function
 .align 16
 _aesni_decrypt3:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
 xorps %xmm0,%xmm4
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .Ldec_loop3:
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
 decl %eax
 .byte 102,15,56,222,225
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leaq 32(%rcx),%rcx
 .byte 102,15,56,222,224
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Ldec_loop3
 .byte 102,15,56,222,209
@@ -146,15 +146,15 @@ _aesni_decrypt3:
 .type _aesni_encrypt4,@function
 .align 16
 _aesni_encrypt4:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
 xorps %xmm0,%xmm4
 xorps %xmm0,%xmm5
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .Lenc_loop4:
 .byte 102,15,56,220,209
@@ -162,13 +162,13 @@ _aesni_encrypt4:
 decl %eax
 .byte 102,15,56,220,225
 .byte 102,15,56,220,233
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leaq 32(%rcx),%rcx
 .byte 102,15,56,220,224
 .byte 102,15,56,220,232
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lenc_loop4
 .byte 102,15,56,220,209
@@ -184,15 +184,15 @@ _aesni_encrypt4:
 .type _aesni_decrypt4,@function
 .align 16
 _aesni_decrypt4:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
 xorps %xmm0,%xmm4
 xorps %xmm0,%xmm5
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .Ldec_loop4:
 .byte 102,15,56,222,209
@@ -200,13 +200,13 @@ _aesni_decrypt4:
 decl %eax
 .byte 102,15,56,222,225
 .byte 102,15,56,222,233
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leaq 32(%rcx),%rcx
 .byte 102,15,56,222,224
 .byte 102,15,56,222,232
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Ldec_loop4
 .byte 102,15,56,222,209
@@ -222,9 +222,9 @@ _aesni_decrypt4:
 .type _aesni_encrypt6,@function
 .align 16
 _aesni_encrypt6:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
@@ -238,7 +238,7 @@ _aesni_encrypt6:
 pxor %xmm0,%xmm7
 decl %eax
 .byte 102,15,56,220,241
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .byte 102,15,56,220,249
 jmp .Lenc_loop6_enter
 .align 16
@@ -251,7 +251,7 @@ _aesni_encrypt6:
 .byte 102,15,56,220,241
 .byte 102,15,56,220,249
 .Lenc_loop6_enter:
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leaq 32(%rcx),%rcx
@@ -259,7 +259,7 @@ _aesni_encrypt6:
 .byte 102,15,56,220,232
 .byte 102,15,56,220,240
 .byte 102,15,56,220,248
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lenc_loop6
 .byte 102,15,56,220,209
@@ -279,9 +279,9 @@ _aesni_encrypt6:
 .type _aesni_decrypt6,@function
 .align 16
 _aesni_decrypt6:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
@@ -295,7 +295,7 @@ _aesni_decrypt6:
 pxor %xmm0,%xmm7
 decl %eax
 .byte 102,15,56,222,241
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .byte 102,15,56,222,249
 jmp .Ldec_loop6_enter
 .align 16
@@ -308,7 +308,7 @@ _aesni_decrypt6:
 .byte 102,15,56,222,241
 .byte 102,15,56,222,249
 .Ldec_loop6_enter:
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leaq 32(%rcx),%rcx
@@ -316,7 +316,7 @@ _aesni_decrypt6:
 .byte 102,15,56,222,232
 .byte 102,15,56,222,240
 .byte 102,15,56,222,248
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Ldec_loop6
 .byte 102,15,56,222,209
@@ -336,9 +336,9 @@ _aesni_decrypt6:
 .type _aesni_encrypt8,@function
 .align 16
 _aesni_encrypt8:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
@@ -355,10 +355,10 @@ _aesni_encrypt8:
 pxor %xmm0,%xmm8
 .byte 102,15,56,220,249
 pxor %xmm0,%xmm9
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .byte 102,68,15,56,220,193
 .byte 102,68,15,56,220,201
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 jmp .Lenc_loop8_enter
 .align 16
 .Lenc_loop8:
@@ -371,7 +371,7 @@ _aesni_encrypt8:
 .byte 102,15,56,220,249
 .byte 102,68,15,56,220,193
 .byte 102,68,15,56,220,201
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .Lenc_loop8_enter:
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
@@ -382,7 +382,7 @@ _aesni_encrypt8:
 .byte 102,15,56,220,248
 .byte 102,68,15,56,220,192
 .byte 102,68,15,56,220,200
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lenc_loop8
 .byte 102,15,56,220,209
@@ -406,9 +406,9 @@ _aesni_encrypt8:
 .type _aesni_decrypt8,@function
 .align 16
 _aesni_decrypt8:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm0,%xmm3
@@ -425,10 +425,10 @@ _aesni_decrypt8:
 pxor %xmm0,%xmm8
 .byte 102,15,56,222,249
 pxor %xmm0,%xmm9
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .byte 102,68,15,56,222,193
 .byte 102,68,15,56,222,201
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 jmp .Ldec_loop8_enter
 .align 16
 .Ldec_loop8:
@@ -441,7 +441,7 @@ _aesni_decrypt8:
 .byte 102,15,56,222,249
 .byte 102,68,15,56,222,193
 .byte 102,68,15,56,222,201
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .Ldec_loop8_enter:
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
@@ -452,7 +452,7 @@ _aesni_decrypt8:
 .byte 102,15,56,222,248
 .byte 102,68,15,56,222,192
 .byte 102,68,15,56,222,200
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Ldec_loop8
 .byte 102,15,56,222,209
@@ -481,7 +481,7 @@ aesni_ecb_encrypt:
 jz .Lecb_ret
 movl 240(%rcx),%eax
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 movq %rcx,%r11
 movl %eax,%r10d
 testl %r8d,%r8d
@@ -572,14 +572,14 @@ aesni_ecb_encrypt:
 jmp .Lecb_ret
 .align 16
 .Lecb_enc_one:
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_enc1_3:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_3
 .byte 102,15,56,221,209
@@ -670,7 +670,7 @@ aesni_ecb_encrypt:
 call _aesni_decrypt8
-movaps (%r11),%xmm0
+movups (%r11),%xmm0
 subq $128,%rdx
 jnc .Lecb_dec_loop8
@@ -705,7 +705,7 @@ aesni_ecb_encrypt:
 movups 80(%rdi),%xmm7
 je .Lecb_dec_six
 movups 96(%rdi),%xmm8
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 call _aesni_decrypt8
 movups %xmm2,(%rsi)
 movups %xmm3,16(%rsi)
@@ -717,14 +717,14 @@ aesni_ecb_encrypt:
 jmp .Lecb_ret
 .align 16
 .Lecb_dec_one:
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_dec1_4:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_dec1_4
 .byte 102,15,56,223,209
@@ -779,54 +779,54 @@ aesni_ecb_encrypt:
 .type aesni_ccm64_encrypt_blocks,@function
 .align 16
 aesni_ccm64_encrypt_blocks:
+movl 240(%rcx),%eax
 movdqu (%r8),%xmm9
-movdqu (%r9),%xmm3
-movdqa .Lincrement64(%rip),%xmm8
-movdqa .Lbswap_mask(%rip),%xmm9
-.byte 102,69,15,56,0,201
+movdqa .Lincrement64(%rip),%xmm6
+movdqa .Lbswap_mask(%rip),%xmm7
-movl 240(%rcx),%eax
-movq %rcx,%r11
-movl %eax,%r10d
+shrl $1,%eax
+leaq 0(%rcx),%r11
+movdqu (%r9),%xmm3
 movdqa %xmm9,%xmm2
-
+movl %eax,%r10d
+jmp .Lccm64_enc_outer
+.align 16
 .Lccm64_enc_outer:
-movups (%rdi),%xmm8
-.byte 102,65,15,56,0,209
-movq %r11,%rcx
+movups (%r11),%xmm0
 movl %r10d,%eax
+movups (%rdi),%xmm8
-movaps (%rcx),%xmm0
-shrl $1,%eax
-movaps 16(%rcx),%xmm1
-xorps %xmm0,%xmm8
-leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
-xorps %xmm3,%xmm8
-movaps (%rcx),%xmm0
+movups 16(%r11),%xmm1
+xorps %xmm8,%xmm0
+leaq 32(%r11),%rcx
+xorps %xmm0,%xmm3
+movups (%rcx),%xmm0
 .Lccm64_enc2_loop:
 .byte 102,15,56,220,209
 decl %eax
 .byte 102,15,56,220,217
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 leaq 32(%rcx),%rcx
 .byte 102,15,56,220,216
-movaps 0(%rcx),%xmm0
+movups 0(%rcx),%xmm0
 jnz .Lccm64_enc2_loop
+.byte 102,68,15,56,0,207
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
+paddq %xmm6,%xmm9
 .byte 102,15,56,221,208
 .byte 102,15,56,221,216
-paddq %xmm8,%xmm9
 decq %rdx
 leaq 16(%rdi),%rdi
 xorps %xmm2,%xmm8
 movdqa %xmm9,%xmm2
 movups %xmm8,(%rsi)
 leaq 16(%rsi),%rsi
+.byte 102,68,15,56,0,207
 jnz .Lccm64_enc_outer
 movups %xmm3,(%r9)
@@ -836,35 +836,36 @@ aesni_ccm64_encrypt_blocks:
 .type aesni_ccm64_decrypt_blocks,@function
 .align 16
 aesni_ccm64_decrypt_blocks:
-movdqu (%r8),%xmm9
+movl 240(%rcx),%eax
+movups (%r8),%xmm9
 movdqu (%r9),%xmm3
-movdqa .Lincrement64(%rip),%xmm8
-movdqa .Lbswap_mask(%rip),%xmm9
+movdqa .Lincrement64(%rip),%xmm6
+movdqa .Lbswap_mask(%rip),%xmm7
-movl 240(%rcx),%eax
-movdqa %xmm9,%xmm2
-.byte 102,69,15,56,0,201
+movaps %xmm9,%xmm2
 movl %eax,%r10d
 movq %rcx,%r11
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+.byte 102,68,15,56,0,207
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_enc1_5:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_5
 .byte 102,15,56,221,209
-.Lccm64_dec_outer:
-paddq %xmm8,%xmm9
 movups (%rdi),%xmm8
+paddq %xmm6,%xmm9
+.byte 102,68,15,56,0,207
+leaq 16(%rdi),%rdi
+jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
 xorps %xmm2,%xmm8
 movdqa %xmm9,%xmm2
-leaq 16(%rdi),%rdi
-.byte 102,65,15,56,0,209
-movq %r11,%rcx
 movl %r10d,%eax
 movups %xmm8,(%rsi)
 leaq 16(%rsi),%rsi
@@ -872,41 +873,48 @@ aesni_ccm64_decrypt_blocks:
 subq $1,%rdx
 jz .Lccm64_dec_break
-movaps (%rcx),%xmm0
+movups (%r11),%xmm0
 shrl $1,%eax
-movaps 16(%rcx),%xmm1
+movups 16(%r11),%xmm1
 xorps %xmm0,%xmm8
-leaq 32(%rcx),%rcx
+leaq 32(%r11),%rcx
 xorps %xmm0,%xmm2
 xorps %xmm8,%xmm3
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .Lccm64_dec2_loop:
 .byte 102,15,56,220,209
 decl %eax
 .byte 102,15,56,220,217
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 leaq 32(%rcx),%rcx
 .byte 102,15,56,220,216
-movaps 0(%rcx),%xmm0
+movups 0(%rcx),%xmm0
 jnz .Lccm64_dec2_loop
+movups (%rdi),%xmm8
+paddq %xmm6,%xmm9
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
+.byte 102,68,15,56,0,207
+leaq 16(%rdi),%rdi
 .byte 102,15,56,221,208
+.byte 102,15,56,221,216
 jmp .Lccm64_dec_outer
 .align 16
 .Lccm64_dec_break:
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
-leaq 32(%rcx),%rcx
-xorps %xmm0,%xmm3
+
+movups (%r11),%xmm0
+movups 16(%r11),%xmm1
+xorps %xmm0,%xmm8
+leaq 32(%r11),%r11
+xorps %xmm8,%xmm3
 .Loop_enc1_6:
 .byte 102,15,56,220,217
 decl %eax
-movaps (%rcx),%xmm1
-leaq 16(%rcx),%rcx
+movups (%r11),%xmm1
+leaq 16(%r11),%r11
 jnz .Loop_enc1_6
 .byte 102,15,56,221,217
 movups %xmm3,(%r9)
@@ -960,10 +968,10 @@ aesni_ctr32_encrypt_blocks:
 .Lctr32_loop6:
 pshufd $192,%xmm13,%xmm5
 por %xmm14,%xmm2
-movaps (%r11),%xmm0
+movups (%r11),%xmm0
 pshufd $128,%xmm13,%xmm6
 por %xmm14,%xmm3
-movaps 16(%r11),%xmm1
+movups 16(%r11),%xmm1
 pshufd $64,%xmm13,%xmm7
 por %xmm14,%xmm4
 por %xmm14,%xmm5
@@ -986,7 +994,7 @@ aesni_ctr32_encrypt_blocks:
 pxor %xmm0,%xmm6
 .byte 102,15,56,220,233
 pxor %xmm0,%xmm7
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 decl %eax
 .byte 102,15,56,220,241
 .byte 102,15,56,220,249
@@ -1001,7 +1009,7 @@ aesni_ctr32_encrypt_blocks:
 .byte 102,15,56,220,241
 .byte 102,15,56,220,249
 .Lctr32_enc_loop6_enter:
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leaq 32(%rcx),%rcx
@@ -1009,7 +1017,7 @@ aesni_ctr32_encrypt_blocks:
 .byte 102,15,56,220,232
 .byte 102,15,56,220,240
 .byte 102,15,56,220,248
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lctr32_enc_loop6
 .byte 102,15,56,220,209
@@ -1109,14 +1117,14 @@ aesni_ctr32_encrypt_blocks:
 movups (%rdi),%xmm8
 movl 240(%rcx),%eax
 .Lctr32_one:
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_enc1_7:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_7
 .byte 102,15,56,221,209
@@ -1168,14 +1176,14 @@ aesni_xts_encrypt:
 movups (%r9),%xmm15
 movl 240(%r8),%eax
 movl 240(%rcx),%r10d
-movaps (%r8),%xmm0
-movaps 16(%r8),%xmm1
+movups (%r8),%xmm0
+movups 16(%r8),%xmm1
 leaq 32(%r8),%r8
 xorps %xmm0,%xmm15
 .Loop_enc1_8:
 .byte 102,68,15,56,220,249
 decl %eax
-movaps (%r8),%xmm1
+movups (%r8),%xmm1
 leaq 16(%r8),%r8
 jnz .Loop_enc1_8
 .byte 102,68,15,56,221,249
@@ -1242,13 +1250,13 @@ aesni_xts_encrypt:
 movdqu 80(%rdi),%xmm7
 leaq 96(%rdi),%rdi
 pxor %xmm13,%xmm5
-movaps (%r11),%xmm0
+movups (%r11),%xmm0
 pxor %xmm14,%xmm6
 pxor %xmm15,%xmm7
-movaps 16(%r11),%xmm1
+movups 16(%r11),%xmm1
 pxor %xmm0,%xmm2
 pxor %xmm0,%xmm3
 movdqa %xmm10,0(%rsp)
@@ -1264,7 +1272,7 @@ aesni_xts_encrypt:
 movdqa %xmm13,48(%rsp)
 .byte 102,15,56,220,233
 pxor %xmm0,%xmm7
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 decl %eax
 movdqa %xmm14,64(%rsp)
 .byte 102,15,56,220,241
@@ -1284,7 +1292,7 @@ aesni_xts_encrypt:
 .byte 102,15,56,220,241
 .byte 102,15,56,220,249
 .Lxts_enc_loop6_enter:
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leaq 32(%rcx),%rcx
@@ -1292,7 +1300,7 @@ aesni_xts_encrypt:
 .byte 102,15,56,220,232
 .byte 102,15,56,220,240
 .byte 102,15,56,220,248
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lxts_enc_loop6
 pshufd $19,%xmm14,%xmm9
@@ -1307,7 +1315,7 @@ aesni_xts_encrypt:
 .byte 102,15,56,220,233
 .byte 102,15,56,220,241
 .byte 102,15,56,220,249
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 pshufd $19,%xmm14,%xmm9
 pxor %xmm14,%xmm14
@@ -1322,7 +1330,7 @@ aesni_xts_encrypt:
 .byte 102,15,56,220,232
 .byte 102,15,56,220,240
 .byte 102,15,56,220,248
-movaps 32(%rcx),%xmm0
+movups 32(%rcx),%xmm0
 pshufd $19,%xmm14,%xmm9
 pxor %xmm14,%xmm14
@@ -1432,14 +1440,14 @@ aesni_xts_encrypt:
 movups (%rdi),%xmm2
 leaq 16(%rdi),%rdi
 xorps %xmm10,%xmm2
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_enc1_9:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_9
 .byte 102,15,56,221,209
@@ -1537,14 +1545,14 @@ aesni_xts_encrypt:
 movups -16(%rsi),%xmm2
 xorps %xmm10,%xmm2
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_enc1_10:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_10
 .byte 102,15,56,221,209
@@ -1564,14 +1572,14 @@ aesni_xts_decrypt:
 movups (%r9),%xmm15
 movl 240(%r8),%eax
 movl 240(%rcx),%r10d
-movaps (%r8),%xmm0
-movaps 16(%r8),%xmm1
+movups (%r8),%xmm0
+movups 16(%r8),%xmm1
 leaq 32(%r8),%r8
 xorps %xmm0,%xmm15
 .Loop_enc1_11:
 .byte 102,68,15,56,220,249
 decl %eax
-movaps (%r8),%xmm1
+movups (%r8),%xmm1
 leaq 16(%r8),%r8
 jnz .Loop_enc1_11
 .byte 102,68,15,56,221,249
@@ -1644,13 +1652,13 @@ aesni_xts_decrypt:
 movdqu 80(%rdi),%xmm7
 leaq 96(%rdi),%rdi
 pxor %xmm13,%xmm5
-movaps (%r11),%xmm0
+movups (%r11),%xmm0
 pxor %xmm14,%xmm6
 pxor %xmm15,%xmm7
-movaps 16(%r11),%xmm1
+movups 16(%r11),%xmm1
 pxor %xmm0,%xmm2
 pxor %xmm0,%xmm3
 movdqa %xmm10,0(%rsp)
@@ -1666,7 +1674,7 @@ aesni_xts_decrypt:
 movdqa %xmm13,48(%rsp)
 .byte 102,15,56,222,233
 pxor %xmm0,%xmm7
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 decl %eax
 movdqa %xmm14,64(%rsp)
 .byte 102,15,56,222,241
@@ -1686,7 +1694,7 @@ aesni_xts_decrypt:
 .byte 102,15,56,222,241
 .byte 102,15,56,222,249
 .Lxts_dec_loop6_enter:
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leaq 32(%rcx),%rcx
@@ -1694,7 +1702,7 @@ aesni_xts_decrypt:
 .byte 102,15,56,222,232
 .byte 102,15,56,222,240
 .byte 102,15,56,222,248
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 jnz .Lxts_dec_loop6
 pshufd $19,%xmm14,%xmm9
@@ -1709,7 +1717,7 @@ aesni_xts_decrypt:
 .byte 102,15,56,222,233
 .byte 102,15,56,222,241
 .byte 102,15,56,222,249
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 pshufd $19,%xmm14,%xmm9
 pxor %xmm14,%xmm14
@@ -1724,7 +1732,7 @@ aesni_xts_decrypt:
 .byte 102,15,56,222,232
 .byte 102,15,56,222,240
 .byte 102,15,56,222,248
-movaps 32(%rcx),%xmm0
+movups 32(%rcx),%xmm0
 pshufd $19,%xmm14,%xmm9
 pxor %xmm14,%xmm14
@@ -1843,14 +1851,14 @@ aesni_xts_decrypt:
 movups (%rdi),%xmm2
 leaq 16(%rdi),%rdi
 xorps %xmm10,%xmm2
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_dec1_12:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_dec1_12
 .byte 102,15,56,223,209
@@ -1947,14 +1955,14 @@ aesni_xts_decrypt:
 movups (%rdi),%xmm2
 xorps %xmm11,%xmm2
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_dec1_13:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_dec1_13
 .byte 102,15,56,223,209
@@ -1977,14 +1985,14 @@ aesni_xts_decrypt:
 movups (%rsi),%xmm2
 xorps %xmm10,%xmm2
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_dec1_14:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_dec1_14
 .byte 102,15,56,223,209
@@ -2019,15 +2027,15 @@ aesni_cbc_encrypt:
 movups (%rdi),%xmm3
 leaq 16(%rdi),%rdi
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 xorps %xmm0,%xmm3
 leaq 32(%rcx),%rcx
 xorps %xmm3,%xmm2
 .Loop_enc1_15:
 .byte 102,15,56,220,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_enc1_15
 .byte 102,15,56,221,209
@@ -2074,10 +2082,10 @@ aesni_cbc_encrypt:
 movups %xmm9,(%rsi)
 leaq 16(%rsi),%rsi
 .Lcbc_dec_loop8_enter:
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 movups (%rdi),%xmm2
 movups 16(%rdi),%xmm3
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 movdqu 32(%rdi),%xmm4
@@ -2101,10 +2109,10 @@ aesni_cbc_encrypt:
 pxor %xmm0,%xmm8
 .byte 102,15,56,222,249
 pxor %xmm0,%xmm9
-movaps (%rcx),%xmm0
+movups (%rcx),%xmm0
 .byte 102,68,15,56,222,193
 .byte 102,68,15,56,222,201
-movaps 16(%rcx),%xmm1
+movups 16(%rcx),%xmm1
 call .Ldec_loop8_enter
@@ -2202,14 +2210,14 @@ aesni_cbc_encrypt:
 jmp .Lcbc_dec_tail_collected
 .align 16
 .Lcbc_dec_one:
-movaps (%rcx),%xmm0
-movaps 16(%rcx),%xmm1
+movups (%rcx),%xmm0
+movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
 xorps %xmm0,%xmm2
 .Loop_dec1_16:
 .byte 102,15,56,222,209
 decl %eax
-movaps (%rcx),%xmm1
+movups (%rcx),%xmm1
 leaq 16(%rcx),%rcx
 jnz .Loop_dec1_16
 .byte 102,15,56,223,209
@@ -2332,28 +2340,28 @@ aesni_set_decrypt_key:
 jnz .Ldec_key_ret
 leaq 16(%rdx,%rsi,1),%rdi
-movaps (%rdx),%xmm0
-movaps (%rdi),%xmm1
-movaps %xmm0,(%rdi)
-movaps %xmm1,(%rdx)
+movups (%rdx),%xmm0
+movups (%rdi),%xmm1
+movups %xmm0,(%rdi)
+movups %xmm1,(%rdx)
 leaq 16(%rdx),%rdx
 leaq -16(%rdi),%rdi
 .Ldec_key_inverse:
-movaps (%rdx),%xmm0
-movaps (%rdi),%xmm1
+movups (%rdx),%xmm0
+movups (%rdi),%xmm1
 .byte 102,15,56,219,192
 .byte 102,15,56,219,201
 leaq 16(%rdx),%rdx
 leaq -16(%rdi),%rdi
-movaps %xmm0,16(%rdi)
-movaps %xmm1,-16(%rdx)
+movups %xmm0,16(%rdi)
+movups %xmm1,-16(%rdx)
 cmpq %rdx,%rdi
 ja .Ldec_key_inverse
-movaps (%rdx),%xmm0
+movups (%rdx),%xmm0
 .byte 102,15,56,219,192
-movaps %xmm0,(%rdi)
+movups %xmm0,(%rdi)
 .Ldec_key_ret:
 addq $8,%rsp
 .byte 0xf3,0xc3
@@ -2383,7 +2391,7 @@ __aesni_set_encrypt_key:
 .L10rounds:
 movl $9,%esi
-movaps %xmm0,(%rdx)
+movups %xmm0,(%rdx)
 .byte 102,15,58,223,200,1
 call .Lkey_expansion_128_cold
 .byte 102,15,58,223,200,2
@@ -2404,7 +2412,7 @@ __aesni_set_encrypt_key:
 call .Lkey_expansion_128
 .byte 102,15,58,223,200,54
 call .Lkey_expansion_128
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 movl %esi,80(%rax)
 xorl %eax,%eax
 jmp .Lenc_key_ret
@@ -2413,7 +2421,7 @@ __aesni_set_encrypt_key:
 .L12rounds:
 movq 16(%rdi),%xmm2
 movl $11,%esi
-movaps %xmm0,(%rdx)
+movups %xmm0,(%rdx)
 .byte 102,15,58,223,202,1
 call .Lkey_expansion_192a_cold
 .byte 102,15,58,223,202,2
@@ -2430,7 +2438,7 @@ __aesni_set_encrypt_key:
 call .Lkey_expansion_192a
 .byte 102,15,58,223,202,128
 call .Lkey_expansion_192b
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 movl %esi,48(%rax)
 xorq %rax,%rax
 jmp .Lenc_key_ret
@@ -2440,8 +2448,8 @@ __aesni_set_encrypt_key:
 movups 16(%rdi),%xmm2
 movl $13,%esi
 leaq 16(%rax),%rax
-movaps %xmm0,(%rdx)
-movaps %xmm2,16(%rdx)
+movups %xmm0,(%rdx)
+movups %xmm2,16(%rdx)
 .byte 102,15,58,223,202,1
 call .Lkey_expansion_256a_cold
 .byte 102,15,58,223,200,1
@@ -2468,7 +2476,7 @@ __aesni_set_encrypt_key:
 call .Lkey_expansion_256b
 .byte 102,15,58,223,202,64
 call .Lkey_expansion_256a
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 movl %esi,16(%rax)
 xorq %rax,%rax
 jmp .Lenc_key_ret
@@ -2483,7 +2491,7 @@ __aesni_set_encrypt_key:
 .align 16
 .Lkey_expansion_128:
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 leaq 16(%rax),%rax
 .Lkey_expansion_128_cold:
 shufps $16,%xmm0,%xmm4
@@ -2496,7 +2504,7 @@ __aesni_set_encrypt_key:
 .align 16
 .Lkey_expansion_192a:
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 leaq 16(%rax),%rax
 .Lkey_expansion_192a_cold:
 movaps %xmm2,%xmm5
@@ -2518,15 +2526,15 @@ __aesni_set_encrypt_key:
 .Lkey_expansion_192b:
 movaps %xmm0,%xmm3
 shufps $68,%xmm0,%xmm5
-movaps %xmm5,(%rax)
+movups %xmm5,(%rax)
 shufps $78,%xmm2,%xmm3
-movaps %xmm3,16(%rax)
+movups %xmm3,16(%rax)
 leaq 32(%rax),%rax
 jmp .Lkey_expansion_192b_warm
 .align 16
 .Lkey_expansion_256a:
-movaps %xmm2,(%rax)
+movups %xmm2,(%rax)
 leaq 16(%rax),%rax
 .Lkey_expansion_256a_cold:
 shufps $16,%xmm0,%xmm4
@@ -2539,7 +2547,7 @@ __aesni_set_encrypt_key:
 .align 16
 .Lkey_expansion_256b:
-movaps %xmm0,(%rax)
+movups %xmm0,(%rax)
 leaq 16(%rax),%rax
 shufps $16,%xmm2,%xmm4
diff --git a/lib/accelerated/intel/asm/appro-aes-x86.s b/lib/accelerated/intel/asm/appro-aes-x86.s
index a8dc2ac46b..2084749838 100644
--- a/lib/accelerated/intel/asm/appro-aes-x86.s
+++ b/lib/accelerated/intel/asm/appro-aes-x86.s
@@ -47,14 +47,14 @@ aesni_encrypt:
 movups (%eax),%xmm2
 movl 240(%edx),%ecx
 movl 8(%esp),%eax
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L000enc1_loop_1:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L000enc1_loop_1
 .byte 102,15,56,221,209
@@ -71,14 +71,14 @@ aesni_decrypt:
 movups (%eax),%xmm2
 movl 240(%edx),%ecx
 movl 8(%esp),%eax
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L001dec1_loop_2:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L001dec1_loop_2
 .byte 102,15,56,223,209
@@ -88,25 +88,25 @@ aesni_decrypt:
 .type _aesni_encrypt3,@function
 .align 16
 _aesni_encrypt3:
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 shrl $1,%ecx
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
 pxor %xmm0,%xmm4
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .L002enc3_loop:
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
 decl %ecx
 .byte 102,15,56,220,225
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leal 32(%edx),%edx
 .byte 102,15,56,220,224
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L002enc3_loop
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
@@ -119,25 +119,25 @@ _aesni_encrypt3:
 .type _aesni_decrypt3,@function
 .align 16
 _aesni_decrypt3:
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 shrl $1,%ecx
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
 pxor %xmm0,%xmm4
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .L003dec3_loop:
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
 decl %ecx
 .byte 102,15,56,222,225
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leal 32(%edx),%edx
 .byte 102,15,56,222,224
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L003dec3_loop
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
@@ -150,28 +150,28 @@ _aesni_decrypt3:
 .type _aesni_encrypt4,@function
 .align 16
 _aesni_encrypt4:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 shrl $1,%ecx
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
 pxor %xmm0,%xmm4
 pxor %xmm0,%xmm5
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .L004enc4_loop:
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
 decl %ecx
 .byte 102,15,56,220,225
 .byte 102,15,56,220,233
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leal 32(%edx),%edx
 .byte 102,15,56,220,224
 .byte 102,15,56,220,232
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L004enc4_loop
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
@@ -186,28 +186,28 @@ _aesni_encrypt4:
 .type _aesni_decrypt4,@function
 .align 16
 _aesni_decrypt4:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 shrl $1,%ecx
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
 pxor %xmm0,%xmm4
 pxor %xmm0,%xmm5
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .L005dec4_loop:
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
 decl %ecx
 .byte 102,15,56,222,225
 .byte 102,15,56,222,233
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leal 32(%edx),%edx
 .byte 102,15,56,222,224
 .byte 102,15,56,222,232
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L005dec4_loop
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
@@ -222,9 +222,9 @@ _aesni_decrypt4:
 .type _aesni_encrypt6,@function
 .align 16
 _aesni_encrypt6:
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 shrl $1,%ecx
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
@@ -238,7 +238,7 @@ _aesni_encrypt6:
 .byte 102,15,56,220,233
 pxor %xmm0,%xmm7
 .byte 102,15,56,220,241
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,220,249
 jmp .L_aesni_encrypt6_enter
 .align 16
@@ -252,7 +252,7 @@ _aesni_encrypt6:
 .byte 102,15,56,220,249
 .align 16
 .L_aesni_encrypt6_enter:
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,220,208
 .byte 102,15,56,220,216
 leal 32(%edx),%edx
@@ -260,7 +260,7 @@ _aesni_encrypt6:
 .byte 102,15,56,220,232
 .byte 102,15,56,220,240
 .byte 102,15,56,220,248
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L006enc6_loop
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
@@ -279,9 +279,9 @@ _aesni_encrypt6:
 .type _aesni_decrypt6,@function
 .align 16
 _aesni_decrypt6:
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 shrl $1,%ecx
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 pxor %xmm0,%xmm3
@@ -295,7 +295,7 @@ _aesni_decrypt6:
 .byte 102,15,56,222,233
 pxor %xmm0,%xmm7
 .byte 102,15,56,222,241
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,222,249
 jmp .L_aesni_decrypt6_enter
 .align 16
@@ -309,7 +309,7 @@ _aesni_decrypt6:
 .byte 102,15,56,222,249
 .align 16
 .L_aesni_decrypt6_enter:
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,222,208
 .byte 102,15,56,222,216
 leal 32(%edx),%edx
@@ -317,7 +317,7 @@ _aesni_decrypt6:
 .byte 102,15,56,222,232
 .byte 102,15,56,222,240
 .byte 102,15,56,222,248
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L007dec6_loop
 .byte 102,15,56,222,209
 .byte 102,15,56,222,217
@@ -418,14 +418,14 @@ aesni_ecb_encrypt:
 jmp .L008ecb_ret
 .align 16
 .L013ecb_enc_one:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L017enc1_loop_3:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L017enc1_loop_3
 .byte 102,15,56,221,209
@@ -521,14 +521,14 @@ aesni_ecb_encrypt:
 jmp .L008ecb_ret
 .align 16
 .L021ecb_dec_one:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L025dec1_loop_4:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L025dec1_loop_4
 .byte 102,15,56,223,209
@@ -583,56 +583,55 @@ aesni_ccm64_encrypt_blocks:
 movl %ebp,48(%esp)
 movdqu (%ebx),%xmm7
 movdqu (%ecx),%xmm3
+movl 240(%edx),%ecx
 movl $202182159,(%esp)
 movl $134810123,4(%esp)
 movl $67438087,8(%esp)
 movl $66051,12(%esp)
-movl $1,%ecx
+movl $1,%ebx
 xorl %ebp,%ebp
-movl %ecx,16(%esp)
+movl %ebx,16(%esp)
 movl %ebp,20(%esp)
 movl %ebp,24(%esp)
 movl %ebp,28(%esp)
-movdqa (%esp),%xmm5
-.byte 102,15,56,0,253
-movl 240(%edx),%ecx
-movl %edx,%ebp
-movl %ecx,%ebx
+shrl $1,%ecx
+leal (%edx),%ebp
 movdqa %xmm7,%xmm2
+movl %ecx,%ebx
+movdqa (%esp),%xmm5
 .L026ccm64_enc_outer:
-movups (%esi),%xmm6
-.byte 102,15,56,0,213
-movl %ebp,%edx
+movups (%ebp),%xmm0
 movl %ebx,%ecx
-movaps (%edx),%xmm0
-shrl $1,%ecx
-movaps 16(%edx),%xmm1
-xorps %xmm0,%xmm6
-leal 32(%edx),%edx
+movups (%esi),%xmm6
 xorps %xmm0,%xmm2
-xorps %xmm6,%xmm3
-movaps (%edx),%xmm0
+movups 16(%ebp),%xmm1
+xorps %xmm6,%xmm0
+leal 32(%ebp),%edx
+xorps %xmm0,%xmm3
+movups (%edx),%xmm0
 .L027ccm64_enc2_loop:
 .byte 102,15,56,220,209
 decl %ecx
 .byte 102,15,56,220,217
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,220,208
 leal 32(%edx),%edx
 .byte 102,15,56,220,216
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L027ccm64_enc2_loop
+.byte 102,15,56,0,253
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
+paddq 16(%esp),%xmm7
 .byte 102,15,56,221,208
 .byte 102,15,56,221,216
-paddq 16(%esp),%xmm7
 decl %eax
 leal 16(%esi),%esi
 xorps %xmm2,%xmm6
 movdqa %xmm7,%xmm2
 movups %xmm6,(%edi)
 leal 16(%edi),%edi
+.byte 102,15,56,0,253
 jnz .L026ccm64_enc_outer
 movl 48(%esp),%esp
 movl 40(%esp),%edi
@@ -664,80 +663,86 @@ aesni_ccm64_decrypt_blocks:
 movl %ebp,48(%esp)
 movdqu (%ebx),%xmm7
 movdqu (%ecx),%xmm3
+movl 240(%edx),%ecx
 movl $202182159,(%esp)
 movl $134810123,4(%esp)
 movl $67438087,8(%esp)
 movl $66051,12(%esp)
-movl $1,%ecx
+movl $1,%ebx
 xorl %ebp,%ebp
-movl %ecx,16(%esp)
+movl %ebx,16(%esp)
 movl %ebp,20(%esp)
 movl %ebp,24(%esp)
 movl %ebp,28(%esp)
 movdqa (%esp),%xmm5
 movdqa %xmm7,%xmm2
-.byte 102,15,56,0,253
-movl 240(%edx),%ecx
 movl %edx,%ebp
 movl %ecx,%ebx
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+.byte 102,15,56,0,253
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L028enc1_loop_5:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L028enc1_loop_5
 .byte 102,15,56,221,209
-.L029ccm64_dec_outer:
-paddq 16(%esp),%xmm7
 movups (%esi),%xmm6
+paddq 16(%esp),%xmm7
+.byte 102,15,56,0,253
+leal 16(%esi),%esi
+jmp .L029ccm64_dec_outer
+.align 16
+.L029ccm64_dec_outer:
 xorps %xmm2,%xmm6
 movdqa %xmm7,%xmm2
-leal 16(%esi),%esi
-.byte 102,15,56,0,213
-movl %ebp,%edx
 movl %ebx,%ecx
 movups %xmm6,(%edi)
 leal 16(%edi),%edi
 subl $1,%eax
 jz .L030ccm64_dec_break
-movaps (%edx),%xmm0
+movups (%ebp),%xmm0
 shrl $1,%ecx
-movaps 16(%edx),%xmm1
+movups 16(%ebp),%xmm1
 xorps %xmm0,%xmm6
-leal 32(%edx),%edx
+leal 32(%ebp),%edx
 xorps %xmm0,%xmm2
 xorps %xmm6,%xmm3
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .L031ccm64_dec2_loop:
 .byte 102,15,56,220,209
 decl %ecx
 .byte 102,15,56,220,217
-movaps 16(%edx),%xmm1
+movups 16(%edx),%xmm1
 .byte 102,15,56,220,208
 leal 32(%edx),%edx
 .byte 102,15,56,220,216
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 jnz .L031ccm64_dec2_loop
+movups (%esi),%xmm6
+paddq 16(%esp),%xmm7
 .byte 102,15,56,220,209
 .byte 102,15,56,220,217
+.byte 102,15,56,0,253
+leal 16(%esi),%esi
 .byte 102,15,56,221,208
 .byte 102,15,56,221,216
 jmp .L029ccm64_dec_outer
 .align 16
 .L030ccm64_dec_break:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movl %ebp,%edx
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 xorps %xmm0,%xmm6
 leal 32(%edx),%edx
 xorps %xmm6,%xmm3
 .L032enc1_loop_6:
 .byte 102,15,56,220,217
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L032enc1_loop_6
 .byte 102,15,56,221,217
@@ -826,8 +831,8 @@ aesni_ctr32_encrypt_blocks:
 por %xmm1,%xmm5
 por %xmm1,%xmm6
 por %xmm1,%xmm7
-movaps (%ebp),%xmm0
-movaps 16(%ebp),%xmm1
+movups (%ebp),%xmm0
+movups 16(%ebp),%xmm1
 leal 32(%ebp),%edx
 decl %ecx
 pxor %xmm0,%xmm2
@@ -841,7 +846,7 @@ aesni_ctr32_encrypt_blocks:
 .byte 102,15,56,220,233
 pxor %xmm0,%xmm7
 .byte 102,15,56,220,241
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,220,249
 call .L_aesni_encrypt6_enter
 movups (%esi),%xmm1
@@ -920,14 +925,14 @@ aesni_ctr32_encrypt_blocks:
 movups (%ebx),%xmm2
 movl 240(%edx),%ecx
 .L037ctr32_one:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L041enc1_loop_7:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L041enc1_loop_7
 .byte 102,15,56,221,209
@@ -994,14 +999,14 @@ aesni_xts_encrypt:
 movl 40(%esp),%esi
 movl 240(%edx),%ecx
 movups (%esi),%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L042enc1_loop_8:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L042enc1_loop_8
 .byte 102,15,56,221,209
@@ -1064,7 +1069,7 @@ aesni_xts_encrypt:
 pshufd $19,%xmm0,%xmm7
 movdqa %xmm1,64(%esp)
 paddq %xmm1,%xmm1
-movaps (%ebp),%xmm0
+movups (%ebp),%xmm0
 pand %xmm3,%xmm7
 movups (%esi),%xmm2
 pxor %xmm1,%xmm7
@@ -1082,7 +1087,7 @@ aesni_xts_encrypt:
 pxor (%esp),%xmm2
 movdqa %xmm7,80(%esp)
 pxor %xmm1,%xmm7
-movaps 16(%ebp),%xmm1
+movups 16(%ebp),%xmm1
 leal 32(%ebp),%edx
 pxor 16(%esp),%xmm3
 .byte 102,15,56,220,209
@@ -1095,7 +1100,7 @@ aesni_xts_encrypt:
 .byte 102,15,56,220,233
 pxor %xmm0,%xmm7
 .byte 102,15,56,220,241
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,220,249
 call .L_aesni_encrypt6_enter
 movdqa 80(%esp),%xmm1
@@ -1196,14 +1201,14 @@ aesni_xts_encrypt:
 movups (%esi),%xmm2
 leal 16(%esi),%esi
 xorps %xmm5,%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L051enc1_loop_9:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L051enc1_loop_9
 .byte 102,15,56,221,209
@@ -1307,14 +1312,14 @@ aesni_xts_encrypt:
 movl %ebx,%ecx
 movups -16(%edi),%xmm2
 xorps %xmm5,%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L054enc1_loop_10:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L054enc1_loop_10
 .byte 102,15,56,221,209
@@ -1341,14 +1346,14 @@ aesni_xts_decrypt:
 movl 40(%esp),%esi
 movl 240(%edx),%ecx
 movups (%esi),%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L055enc1_loop_11:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L055enc1_loop_11
 .byte 102,15,56,221,209
@@ -1416,7 +1421,7 @@ aesni_xts_decrypt:
 pshufd $19,%xmm0,%xmm7
 movdqa %xmm1,64(%esp)
 paddq %xmm1,%xmm1
-movaps (%ebp),%xmm0
+movups (%ebp),%xmm0
 pand %xmm3,%xmm7
 movups (%esi),%xmm2
 pxor %xmm1,%xmm7
@@ -1434,7 +1439,7 @@ aesni_xts_decrypt:
 pxor (%esp),%xmm2
 movdqa %xmm7,80(%esp)
 pxor %xmm1,%xmm7
-movaps 16(%ebp),%xmm1
+movups 16(%ebp),%xmm1
 leal 32(%ebp),%edx
 pxor 16(%esp),%xmm3
 .byte 102,15,56,222,209
@@ -1447,7 +1452,7 @@ aesni_xts_decrypt:
 .byte 102,15,56,222,233
 pxor %xmm0,%xmm7
 .byte 102,15,56,222,241
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,222,249
 call .L_aesni_decrypt6_enter
 movdqa 80(%esp),%xmm1
@@ -1548,14 +1553,14 @@ aesni_xts_decrypt:
 movups (%esi),%xmm2
 leal 16(%esi),%esi
 xorps %xmm5,%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L064dec1_loop_12:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L064dec1_loop_12
 .byte 102,15,56,223,209
@@ -1656,14 +1661,14 @@ aesni_xts_decrypt:
 movl %ebx,%ecx
 movups (%esi),%xmm2
 xorps %xmm5,%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L067dec1_loop_13:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L067dec1_loop_13
 .byte 102,15,56,223,209
@@ -1683,14 +1688,14 @@ aesni_xts_decrypt:
 movl %ebx,%ecx
 movups (%edi),%xmm2
 xorps %xmm6,%xmm2
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L069dec1_loop_14:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L069dec1_loop_14
 .byte 102,15,56,223,209
@@ -1740,15 +1745,15 @@ aesni_cbc_encrypt:
 .L073cbc_enc_loop:
 movups (%esi),%xmm7
 leal 16(%esi),%esi
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 xorps %xmm0,%xmm7
 leal 32(%edx),%edx
 xorps %xmm7,%xmm2
 .L074enc1_loop_15:
 .byte 102,15,56,220,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L074enc1_loop_15
 .byte 102,15,56,221,209
@@ -1863,14 +1868,14 @@ aesni_cbc_encrypt:
 jmp .L079cbc_dec_tail_collected
 .align 16
 .L080cbc_dec_one:
-movaps (%edx),%xmm0
-movaps 16(%edx),%xmm1
+movups (%edx),%xmm0
+movups 16(%edx),%xmm1
 leal 32(%edx),%edx
 xorps %xmm0,%xmm2
 .L084dec1_loop_16:
 .byte 102,15,56,222,209
 decl %ecx
-movaps (%edx),%xmm1
+movups (%edx),%xmm1
 leal 16(%edx),%edx
 jnz .L084dec1_loop_16
 .byte 102,15,56,223,209
@@ -1961,7 +1966,7 @@ _aesni_set_encrypt_key:
 .align 16
 .L09010rounds:
 movl $9,%ecx
-movaps %xmm0,-16(%edx)
+movups %xmm0,-16(%edx)
 .byte 102,15,58,223,200,1
 call .L091key_128_cold
 .byte 102,15,58,223,200,2
@@ -1982,13 +1987,13 @@ _aesni_set_encrypt_key:
 call .L092key_128
 .byte 102,15,58,223,200,54
 call .L092key_128
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 movl %ecx,80(%edx)
 xorl %eax,%eax
 ret
 .align 16
 .L092key_128:
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 leal 16(%edx),%edx
 .L091key_128_cold:
 shufps $16,%xmm0,%xmm4
@@ -2002,7 +2007,7 @@ _aesni_set_encrypt_key:
 .L08812rounds:
 movq 16(%eax),%xmm2
 movl $11,%ecx
-movaps %xmm0,-16(%edx)
+movups %xmm0,-16(%edx)
 .byte 102,15,58,223,202,1
 call .L093key_192a_cold
 .byte 102,15,58,223,202,2
@@ -2019,13 +2024,13 @@ _aesni_set_encrypt_key:
 call .L095key_192a
 .byte 102,15,58,223,202,128
 call .L094key_192b
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 movl %ecx,48(%edx)
 xorl %eax,%eax
 ret
 .align 16
 .L095key_192a:
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 leal 16(%edx),%edx
 .align 16
 .L093key_192a_cold:
@@ -2047,9 +2052,9 @@ _aesni_set_encrypt_key:
 .L094key_192b:
 movaps %xmm0,%xmm3
 shufps $68,%xmm0,%xmm5
-movaps %xmm5,(%edx)
+movups %xmm5,(%edx)
 shufps $78,%xmm2,%xmm3
-movaps %xmm3,16(%edx)
+movups %xmm3,16(%edx)
 leal 32(%edx),%edx
 jmp .L096key_192b_warm
 .align 16
@@ -2057,8 +2062,8 @@ _aesni_set_encrypt_key:
 movups 16(%eax),%xmm2
 movl $13,%ecx
 leal 16(%edx),%edx
-movaps %xmm0,-32(%edx)
-movaps %xmm2,-16(%edx)
+movups %xmm0,-32(%edx)
+movups %xmm2,-16(%edx)
 .byte 102,15,58,223,202,1
 call .L097key_256a_cold
 .byte 102,15,58,223,200,1
@@ -2085,13 +2090,13 @@ _aesni_set_encrypt_key:
 call .L098key_256b
 .byte 102,15,58,223,202,64
 call .L099key_256a
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 movl %ecx,16(%edx)
 xorl %eax,%eax
 ret
 .align 16
 .L099key_256a:
-movaps %xmm2,(%edx)
+movups %xmm2,(%edx)
 leal 16(%edx),%edx
 .L097key_256a_cold:
 shufps $16,%xmm0,%xmm4
@@ -2103,7 +2108,7 @@ _aesni_set_encrypt_key:
 ret
 .align 16
 .L098key_256b:
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 leal 16(%edx),%edx
 shufps $16,%xmm2,%xmm4
 xorps %xmm4,%xmm2
@@ -2146,26 +2151,26 @@ aesni_set_decrypt_key:
 testl %eax,%eax
 jnz .L100dec_key_ret
 leal 16(%edx,%ecx,1),%eax
-movaps (%edx),%xmm0
-movaps (%eax),%xmm1
-movaps %xmm0,(%eax)
-movaps %xmm1,(%edx)
+movups (%edx),%xmm0
+movups (%eax),%xmm1
+movups %xmm0,(%eax)
+movups %xmm1,(%edx)
 leal 16(%edx),%edx
 leal -16(%eax),%eax
 .L101dec_key_inverse:
-movaps (%edx),%xmm0
-movaps (%eax),%xmm1
+movups (%edx),%xmm0
+movups (%eax),%xmm1
 .byte 102,15,56,219,192
 .byte 102,15,56,219,201
 leal 16(%edx),%edx
 leal -16(%eax),%eax
-movaps %xmm0,16(%eax)
-movaps %xmm1,-16(%edx)
+movups %xmm0,16(%eax)
+movups %xmm1,-16(%edx)
 cmpl %edx,%eax
 ja .L101dec_key_inverse
-movaps (%edx),%xmm0
+movups (%edx),%xmm0
 .byte 102,15,56,219,192
-movaps %xmm0,(%edx)
+movups %xmm0,(%edx)
 xorl %eax,%eax
 .L100dec_key_ret:
 ret