author     Nikos Mavrogiannopoulos <nmav@gnutls.org>    2012-03-19 22:55:14 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>    2012-03-19 22:58:02 +0100
commit     9567d93c07f87ecb5c8560b7a45125de28710bc1 (patch)
tree       31a779ef6d1e51589dc257599dca05ea6a768c01 /lib/accelerated/x86/coff
parent     abbfc182f738c654ebeaf75cf6893acc0947699b (diff)
download   gnutls-9567d93c07f87ecb5c8560b7a45125de28710bc1.tar.gz
updated openssl code
Diffstat (limited to 'lib/accelerated/x86/coff')
-rw-r--r--   lib/accelerated/x86/coff/padlock-x86-64-coff.s   162
-rw-r--r--   lib/accelerated/x86/coff/padlock-x86-coff.s      232
2 files changed, 288 insertions, 106 deletions
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
index b69b33275e..9f658ee761 100644
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
@@ -354,8 +354,6 @@ padlock_ecb_encrypt:
 	leaq	16(%rdx),%rdx
 	xorl	%eax,%eax
 	xorl	%ebx,%ebx
-	cmpq	$128,%rcx
-	jbe	.Lecb_short
 	testl	$32,(%rdx)
 	jnz	.Lecb_aligned
 	testq	$15,%rdi
@@ -375,6 +373,21 @@
 	negq	%rax
 	andq	$512-1,%rbx
 	leaq	(%rax,%rbp,1),%rsp
+	movq	$512,%rax
+	cmovzq	%rax,%rbx
+	cmpq	%rbx,%rcx
+	ja	.Lecb_loop
+	movq	%rsi,%rax
+	cmpq	%rsp,%rbp
+	cmoveq	%rdi,%rax
+	addq	%rcx,%rax
+	negq	%rax
+	andq	$4095,%rax
+	cmpq	$128,%rax
+	movq	$-128,%rax
+	cmovaeq	%rbx,%rax
+	andq	%rax,%rbx
+	jz	.Lecb_unaligned_tail
 	jmp	.Lecb_loop
 .p2align	4
 .Lecb_loop:
@@ -404,8 +417,8 @@
 	testq	$15,%rdi
 	jz	.Lecb_out_aligned
 	movq	%rbx,%rcx
-	shrq	$3,%rcx
 	leaq	(%rsp),%rsi
+	shrq	$3,%rcx
 .byte	0xf3,0x48,0xa5
 	subq	%rbx,%rdi
 .Lecb_out_aligned:
@@ -415,9 +428,26 @@
 	addq	%rbx,%rsi
 	subq	%rbx,%rcx
 	movq	$512,%rbx
-	jnz	.Lecb_loop
-
+	jz	.Lecb_break
+	cmpq	%rbx,%rcx
+	jae	.Lecb_loop
+.Lecb_unaligned_tail:
+	xorl	%eax,%eax
 	cmpq	%rsp,%rbp
+	cmoveq	%rcx,%rax
+	movq	%rdi,%r8
+	movq	%rcx,%rbx
+	subq	%rax,%rsp
+	shrq	$3,%rcx
+	leaq	(%rsp),%rdi
+.byte	0xf3,0x48,0xa5
+	movq	%rsp,%rsi
+	movq	%r8,%rdi
+	movq	%rbx,%rcx
+	jmp	.Lecb_loop
+.p2align	4
+.Lecb_break:
+	cmpq	%rbp,%rsp
 	je	.Lecb_done

 	pxor	%xmm0,%xmm0
@@ -431,26 +461,39 @@
 .Lecb_done:
 	leaq	(%rbp),%rsp
 	jmp	.Lecb_exit
-.p2align	4
-.Lecb_short:
-	movq	%rsp,%rbp
-	subq	%rcx,%rsp
-	xorq	%rbx,%rbx
-.Lecb_short_copy:
-	movups	(%rsi,%rbx,1),%xmm0
-	leaq	16(%rbx),%rbx
-	cmpq	%rbx,%rcx
-	movaps	%xmm0,-16(%rsp,%rbx,1)
-	ja	.Lecb_short_copy
-	movq	%rsp,%rsi
-	movq	%rcx,%rbx
-	jmp	.Lecb_loop
+
 .p2align	4
 .Lecb_aligned:
+	leaq	(%rsi,%rcx,1),%rbp
+	negq	%rbp
+	andq	$4095,%rbp
+	xorl	%eax,%eax
+	cmpq	$128,%rbp
+	movq	$128-1,%rbp
+	cmovaeq	%rax,%rbp
+	andq	%rcx,%rbp
+	subq	%rbp,%rcx
+	jz	.Lecb_aligned_tail
 	leaq	-16(%rdx),%rax
 	leaq	16(%rdx),%rbx
 	shrq	$4,%rcx
 .byte	0xf3,0x0f,0xa7,200
+	testq	%rbp,%rbp
+	jz	.Lecb_exit
+
+.Lecb_aligned_tail:
+	movq	%rdi,%r8
+	movq	%rbp,%rbx
+	movq	%rbp,%rcx
+	leaq	(%rsp),%rbp
+	subq	%rcx,%rsp
+	shrq	$3,%rcx
+	leaq	(%rsp),%rdi
+.byte	0xf3,0x48,0xa5
+	leaq	(%r8),%rdi
+	leaq	(%rsp),%rsi
+	movq	%rbx,%rcx
+	jmp	.Lecb_loop
 .Lecb_exit:
 	movl	$1,%eax
 	leaq	8(%rsp),%rsp
@@ -489,8 +532,6 @@ padlock_cbc_encrypt:
 	leaq	16(%rdx),%rdx
 	xorl	%eax,%eax
 	xorl	%ebx,%ebx
-	cmpq	$64,%rcx
-	jbe	.Lcbc_short
 	testl	$32,(%rdx)
 	jnz	.Lcbc_aligned
 	testq	$15,%rdi
@@ -510,6 +551,21 @@
 	negq	%rax
 	andq	$512-1,%rbx
 	leaq	(%rax,%rbp,1),%rsp
+	movq	$512,%rax
+	cmovzq	%rax,%rbx
+	cmpq	%rbx,%rcx
+	ja	.Lcbc_loop
+	movq	%rsi,%rax
+	cmpq	%rsp,%rbp
+	cmoveq	%rdi,%rax
+	addq	%rcx,%rax
+	negq	%rax
+	andq	$4095,%rax
+	cmpq	$64,%rax
+	movq	$-64,%rax
+	cmovaeq	%rbx,%rax
+	andq	%rax,%rbx
+	jz	.Lcbc_unaligned_tail
 	jmp	.Lcbc_loop
 .p2align	4
 .Lcbc_loop:
@@ -541,8 +597,8 @@
 	testq	$15,%rdi
 	jz	.Lcbc_out_aligned
 	movq	%rbx,%rcx
-	shrq	$3,%rcx
 	leaq	(%rsp),%rsi
+	shrq	$3,%rcx
 .byte	0xf3,0x48,0xa5
 	subq	%rbx,%rdi
 .Lcbc_out_aligned:
@@ -552,9 +608,26 @@
 	addq	%rbx,%rsi
 	subq	%rbx,%rcx
 	movq	$512,%rbx
-	jnz	.Lcbc_loop
-
+	jz	.Lcbc_break
+	cmpq	%rbx,%rcx
+	jae	.Lcbc_loop
+.Lcbc_unaligned_tail:
+	xorl	%eax,%eax
 	cmpq	%rsp,%rbp
+	cmoveq	%rcx,%rax
+	movq	%rdi,%r8
+	movq	%rcx,%rbx
+	subq	%rax,%rsp
+	shrq	$3,%rcx
+	leaq	(%rsp),%rdi
+.byte	0xf3,0x48,0xa5
+	movq	%rsp,%rsi
+	movq	%r8,%rdi
+	movq	%rbx,%rcx
+	jmp	.Lcbc_loop
+.p2align	4
+.Lcbc_break:
+	cmpq	%rbp,%rsp
 	je	.Lcbc_done

 	pxor	%xmm0,%xmm0
@@ -568,28 +641,41 @@
 .Lcbc_done:
 	leaq	(%rbp),%rsp
 	jmp	.Lcbc_exit
-.p2align	4
-.Lcbc_short:
-	movq	%rsp,%rbp
-	subq	%rcx,%rsp
-	xorq	%rbx,%rbx
-.Lcbc_short_copy:
-	movups	(%rsi,%rbx,1),%xmm0
-	leaq	16(%rbx),%rbx
-	cmpq	%rbx,%rcx
-	movaps	%xmm0,-16(%rsp,%rbx,1)
-	ja	.Lcbc_short_copy
-	movq	%rsp,%rsi
-	movq	%rcx,%rbx
-	jmp	.Lcbc_loop
+
 .p2align	4
 .Lcbc_aligned:
+	leaq	(%rsi,%rcx,1),%rbp
+	negq	%rbp
+	andq	$4095,%rbp
+	xorl	%eax,%eax
+	cmpq	$64,%rbp
+	movq	$64-1,%rbp
+	cmovaeq	%rax,%rbp
+	andq	%rcx,%rbp
+	subq	%rbp,%rcx
+	jz	.Lcbc_aligned_tail
 	leaq	-16(%rdx),%rax
 	leaq	16(%rdx),%rbx
 	shrq	$4,%rcx
 .byte	0xf3,0x0f,0xa7,208
 	movdqa	(%rax),%xmm0
 	movdqa	%xmm0,-16(%rdx)
+	testq	%rbp,%rbp
+	jz	.Lcbc_exit
+
+.Lcbc_aligned_tail:
+	movq	%rdi,%r8
+	movq	%rbp,%rbx
+	movq	%rbp,%rcx
+	leaq	(%rsp),%rbp
+	subq	%rcx,%rsp
+	shrq	$3,%rcx
+	leaq	(%rsp),%rdi
+.byte	0xf3,0x48,0xa5
+	leaq	(%r8),%rdi
+	leaq	(%rsp),%rsi
+	movq	%rbx,%rcx
+	jmp	.Lcbc_loop
 .Lcbc_exit:
 	movl	$1,%eax
 	leaq	8(%rsp),%rsp
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
index b068083fa6..69eb468638 100644
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
@@ -180,16 +180,14 @@ _padlock_ecb_encrypt:
 	leal	16(%edx),%edx
 	xorl	%eax,%eax
 	xorl	%ebx,%ebx
-	cmpl	$128,%ecx
-	jbe	.L006ecb_short
 	testl	$32,(%edx)
-	jnz	.L007ecb_aligned
+	jnz	.L006ecb_aligned
 	testl	$15,%edi
 	setz	%al
 	testl	$15,%esi
 	setz	%bl
 	testl	%ebx,%eax
-	jnz	.L007ecb_aligned
+	jnz	.L006ecb_aligned
 	negl	%eax
 	movl	$512,%ebx
 	notl	%eax
@@ -201,10 +199,28 @@
 	negl	%eax
 	andl	$511,%ebx
 	leal	(%eax,%ebp,1),%esp
+	movl	$512,%eax
+	cmovzl	%eax,%ebx
+	movl	%ebp,%eax
+	andl	$-16,%ebp
 	andl	$-16,%esp
-	jmp	.L008ecb_loop
+	movl	%eax,16(%ebp)
+	cmpl	%ebx,%ecx
+	ja	.L007ecb_loop
+	movl	%esi,%eax
+	cmpl	%esp,%ebp
+	cmovel	%edi,%eax
+	addl	%ecx,%eax
+	negl	%eax
+	andl	$4095,%eax
+	cmpl	$128,%eax
+	movl	$-128,%eax
+	cmovael	%ebx,%eax
+	andl	%eax,%ebx
+	jz	.L008ecb_unaligned_tail
+	jmp	.L007ecb_loop
 .align	16
-.L008ecb_loop:
+.L007ecb_loop:
 	movl	%edi,(%ebp)
 	movl	%esi,4(%ebp)
 	movl	%ecx,8(%ebp)
@@ -229,8 +245,8 @@
 	testl	$15,%edi
 	jz	.L010ecb_out_aligned
 	movl	%ebx,%ecx
-	shrl	$2,%ecx
 	leal	(%esp),%esi
+	shrl	$2,%ecx
 .byte	243,165
 	subl	%ebx,%edi
 .L010ecb_out_aligned:
@@ -240,43 +256,75 @@
 	addl	%ebx,%esi
 	subl	%ebx,%ecx
 	movl	$512,%ebx
-	jnz	.L008ecb_loop
+	jz	.L011ecb_break
+	cmpl	%ebx,%ecx
+	jae	.L007ecb_loop
+.L008ecb_unaligned_tail:
+	xorl	%eax,%eax
+	cmpl	%ebp,%esp
+	cmovel	%ecx,%eax
+	subl	%eax,%esp
+	movl	%edi,%eax
+	movl	%ecx,%ebx
+	shrl	$2,%ecx
+	leal	(%esp),%edi
+.byte	243,165
+	movl	%esp,%esi
+	movl	%eax,%edi
+	movl	%ebx,%ecx
+	jmp	.L007ecb_loop
+.align	16
+.L011ecb_break:
 	cmpl	%ebp,%esp
-	je	.L011ecb_done
+	je	.L012ecb_done
 	pxor	%xmm0,%xmm0
 	leal	(%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
 	movaps	%xmm0,(%eax)
 	leal	16(%eax),%eax
 	cmpl	%eax,%ebp
-	ja	.L012ecb_bzero
-.L011ecb_done:
+	ja	.L013ecb_bzero
+.L012ecb_done:
+	movl	16(%ebp),%ebp
 	leal	24(%ebp),%esp
-	jmp	.L013ecb_exit
+	jmp	.L014ecb_exit
 .align	16
-.L006ecb_short:
+.L006ecb_aligned:
+	leal	(%esi,%ecx,1),%ebp
+	negl	%ebp
+	andl	$4095,%ebp
 	xorl	%eax,%eax
-	leal	-24(%esp),%ebp
-	subl	%ecx,%eax
-	leal	(%eax,%ebp,1),%esp
-	andl	$-16,%esp
-	xorl	%ebx,%ebx
-.L014ecb_short_copy:
-	movups	(%esi,%ebx,1),%xmm0
-	leal	16(%ebx),%ebx
-	cmpl	%ebx,%ecx
-	movaps	%xmm0,-16(%esp,%ebx,1)
-	ja	.L014ecb_short_copy
-	movl	%esp,%esi
-	movl	%ecx,%ebx
-	jmp	.L008ecb_loop
-.align	16
-.L007ecb_aligned:
+	cmpl	$128,%ebp
+	movl	$127,%ebp
+	cmovael	%eax,%ebp
+	andl	%ecx,%ebp
+	subl	%ebp,%ecx
+	jz	.L015ecb_aligned_tail
 	leal	-16(%edx),%eax
 	leal	16(%edx),%ebx
 	shrl	$4,%ecx
 .byte	243,15,167,200
-.L013ecb_exit:
+	testl	%ebp,%ebp
+	jz	.L014ecb_exit
+.L015ecb_aligned_tail:
+	movl	%ebp,%ecx
+	leal	-24(%esp),%ebp
+	movl	%ebp,%esp
+	movl	%ebp,%eax
+	subl	%ecx,%esp
+	andl	$-16,%ebp
+	andl	$-16,%esp
+	movl	%eax,16(%ebp)
+	movl	%edi,%eax
+	movl	%ecx,%ebx
+	shrl	$2,%ecx
+	leal	(%esp),%edi
+.byte	243,165
+	movl	%esp,%esi
+	movl	%eax,%edi
+	movl	%ebx,%ecx
+	jmp	.L007ecb_loop
+.L014ecb_exit:
 	movl	$1,%eax
 	leal	4(%esp),%esp
 .L004ecb_abort:
@@ -299,19 +347,17 @@ _padlock_cbc_encrypt:
 	movl	28(%esp),%edx
 	movl	32(%esp),%ecx
 	testl	$15,%edx
-	jnz	.L015cbc_abort
+	jnz	.L016cbc_abort
 	testl	$15,%ecx
-	jnz	.L015cbc_abort
+	jnz	.L016cbc_abort
 	leal	.Lpadlock_saved_context,%eax
 	pushfl
 	cld
 	call	__padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
 	leal	16(%edx),%edx
 	xorl	%eax,%eax
 	xorl	%ebx,%ebx
-	cmpl	$64,%ecx
-	jbe	.L017cbc_short
 	testl	$32,(%edx)
 	jnz	.L018cbc_aligned
 	testl	$15,%edi
@@ -331,7 +377,25 @@
 	negl	%eax
 	andl	$511,%ebx
 	leal	(%eax,%ebp,1),%esp
+	movl	$512,%eax
+	cmovzl	%eax,%ebx
+	movl	%ebp,%eax
+	andl	$-16,%ebp
 	andl	$-16,%esp
+	movl	%eax,16(%ebp)
+	cmpl	%ebx,%ecx
+	ja	.L019cbc_loop
+	movl	%esi,%eax
+	cmpl	%esp,%ebp
+	cmovel	%edi,%eax
+	addl	%ecx,%eax
+	negl	%eax
+	andl	$4095,%eax
+	cmpl	$64,%eax
+	movl	$-64,%eax
+	cmovael	%ebx,%eax
+	andl	%eax,%ebx
+	jz	.L020cbc_unaligned_tail
 	jmp	.L019cbc_loop
 .align	16
 .L019cbc_loop:
@@ -343,13 +407,13 @@
 	testl	$15,%edi
 	cmovnzl	%esp,%edi
 	testl	$15,%esi
-	jz	.L020cbc_inp_aligned
+	jz	.L021cbc_inp_aligned
 	shrl	$2,%ecx
 .byte	243,165
 	subl	%ebx,%edi
 	movl	%ebx,%ecx
 	movl	%edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
 	leal	-16(%edx),%eax
 	leal	16(%edx),%ebx
 	shrl	$4,%ecx
@@ -359,61 +423,93 @@
 	movl	(%ebp),%edi
 	movl	12(%ebp),%ebx
 	testl	$15,%edi
-	jz	.L021cbc_out_aligned
+	jz	.L022cbc_out_aligned
 	movl	%ebx,%ecx
-	shrl	$2,%ecx
 	leal	(%esp),%esi
+	shrl	$2,%ecx
 .byte	243,165
 	subl	%ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
 	movl	4(%ebp),%esi
 	movl	8(%ebp),%ecx
 	addl	%ebx,%edi
 	addl	%ebx,%esi
 	subl	%ebx,%ecx
 	movl	$512,%ebx
-	jnz	.L019cbc_loop
+	jz	.L023cbc_break
+	cmpl	%ebx,%ecx
+	jae	.L019cbc_loop
+.L020cbc_unaligned_tail:
+	xorl	%eax,%eax
+	cmpl	%ebp,%esp
+	cmovel	%ecx,%eax
+	subl	%eax,%esp
+	movl	%edi,%eax
+	movl	%ecx,%ebx
+	shrl	$2,%ecx
+	leal	(%esp),%edi
+.byte	243,165
+	movl	%esp,%esi
+	movl	%eax,%edi
+	movl	%ebx,%ecx
+	jmp	.L019cbc_loop
+.align	16
+.L023cbc_break:
 	cmpl	%ebp,%esp
-	je	.L022cbc_done
+	je	.L024cbc_done
 	pxor	%xmm0,%xmm0
 	leal	(%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
 	movaps	%xmm0,(%eax)
 	leal	16(%eax),%eax
 	cmpl	%eax,%ebp
-	ja	.L023cbc_bzero
-.L022cbc_done:
+	ja	.L025cbc_bzero
-.L024cbc_done:
+	movl	16(%ebp),%ebp
 	leal	24(%ebp),%esp
-	jmp	.L024cbc_exit
-.align	16
-.L017cbc_short:
-	xorl	%eax,%eax
-	leal	-24(%esp),%ebp
-	subl	%ecx,%eax
-	leal	(%eax,%ebp,1),%esp
-	andl	$-16,%esp
-	xorl	%ebx,%ebx
-.L025cbc_short_copy:
-	movups	(%esi,%ebx,1),%xmm0
-	leal	16(%ebx),%ebx
-	cmpl	%ebx,%ecx
-	movaps	%xmm0,-16(%esp,%ebx,1)
-	ja	.L025cbc_short_copy
-	movl	%esp,%esi
-	movl	%ecx,%ebx
-	jmp	.L019cbc_loop
+	jmp	.L026cbc_exit
 .align	16
 .L018cbc_aligned:
+	leal	(%esi,%ecx,1),%ebp
+	negl	%ebp
+	andl	$4095,%ebp
+	xorl	%eax,%eax
+	cmpl	$64,%ebp
+	movl	$63,%ebp
+	cmovael	%eax,%ebp
+	andl	%ecx,%ebp
+	subl	%ebp,%ecx
+	jz	.L027cbc_aligned_tail
 	leal	-16(%edx),%eax
 	leal	16(%edx),%ebx
 	shrl	$4,%ecx
 .byte	243,15,167,208
 	movaps	(%eax),%xmm0
 	movaps	%xmm0,-16(%edx)
-.L024cbc_exit:
+	testl	%ebp,%ebp
+	jz	.L026cbc_exit
+.L027cbc_aligned_tail:
+	movl	%ebp,%ecx
+	leal	-24(%esp),%ebp
+	movl	%ebp,%esp
+	movl	%ebp,%eax
+	subl	%ecx,%esp
+	andl	$-16,%ebp
+	andl	$-16,%esp
+	movl	%eax,16(%ebp)
+	movl	%edi,%eax
+	movl	%ecx,%ebx
+	shrl	$2,%ecx
+	leal	(%esp),%edi
+.byte	243,165
+	movl	%esp,%esi
+	movl	%eax,%edi
+	movl	%ebx,%ecx
+	jmp	.L019cbc_loop
+.L026cbc_exit:
 	movl	$1,%eax
 	leal	4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -437,10 +533,10 @@ __win32_segv_handler:
 	movl	4(%esp),%edx
 	movl	12(%esp),%ecx
 	cmpl	$3221225477,(%edx)
-	jne	.L026ret
+	jne	.L028ret
 	addl	$4,184(%ecx)
 	movl	$0,%eax
-.L026ret:
+.L028ret:
 	ret
 .globl	_padlock_sha1_oneshot
 .def	_padlock_sha1_oneshot;	.scl	2;	.type	32;	.endef
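Note on the change (not part of the commit): the regenerated OpenSSL assembly drops the old .Lecb_short/.Lcbc_short paths, which copied every short input through the stack, and instead guards only the final chunk against ending too close to a 4 KB page boundary. The sketch below restates that guard in C under the assumption that the PadLock xcrypt engine may touch up to 128 bytes (ECB) or 64 bytes (CBC) past the data it is asked to process; the function and macro names here are illustrative, not part of GnuTLS.

/*
 * C sketch of the page-boundary guard added by this diff; it mirrors the
 *   leaq (%rsi,%rcx,1),%rbp / negq %rbp / andq $4095,%rbp / cmpq $128,%rbp
 * sequences in the assembly.  Hypothetical names throughout.
 */
#include <stddef.h>
#include <stdint.h>

#define PADLOCK_PAGE_MASK 4095u   /* 4 KB pages, as in "andq $4095,%rax" */

/* Bytes from the end of [p, p+len) to the next 4 KB page boundary. */
static size_t bytes_to_page_end(const void *p, size_t len)
{
    return (size_t)(-(uintptr_t)((const uint8_t *)p + len)) & PADLOCK_PAGE_MASK;
}

/* Nonzero if the tail must be bounced through scratch memory before the
 * engine sees it.  The diff uses window = 128 for ECB and 64 for CBC
 * (the "cmpq $128,%rax" / "cmpq $64,%rax" constants). */
static int tail_needs_bounce(const void *buf, size_t len, size_t window)
{
    return bytes_to_page_end(buf, len) < window;
}

When the guard fires, the new .L*_unaligned_tail / .L*_aligned_tail blocks copy the tail to a stack buffer with rep movsq/movsd, run the xcrypt instruction on the copy, and copy the result back, so bulk processing stays in place and only a boundary-adjacent tail pays for the extra copy.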