author    Nikos Mavrogiannopoulos <nmav@gnutls.org>    2012-03-19 22:55:14 +0100
committer Nikos Mavrogiannopoulos <nmav@gnutls.org>    2012-03-19 22:58:02 +0100
commit    9567d93c07f87ecb5c8560b7a45125de28710bc1 (patch)
tree      31a779ef6d1e51589dc257599dca05ea6a768c01 /lib/accelerated/x86/coff
parent    abbfc182f738c654ebeaf75cf6893acc0947699b (diff)
download  gnutls-9567d93c07f87ecb5c8560b7a45125de28710bc1.tar.gz
updated openssl code
Diffstat (limited to 'lib/accelerated/x86/coff')
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-64-coff.s  | 162
-rw-r--r--  lib/accelerated/x86/coff/padlock-x86-coff.s     | 232
2 files changed, 288 insertions(+), 106 deletions(-)
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
index b69b33275e..9f658ee761 100644
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
@@ -354,8 +354,6 @@ padlock_ecb_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $128,%rcx
- jbe .Lecb_short
testl $32,(%rdx)
jnz .Lecb_aligned
testq $15,%rdi
@@ -375,6 +373,21 @@ padlock_ecb_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lecb_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $128,%rax
+ movq $-128,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lecb_unaligned_tail
jmp .Lecb_loop
.p2align 4
.Lecb_loop:
@@ -404,8 +417,8 @@ padlock_ecb_encrypt:
testq $15,%rdi
jz .Lecb_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lecb_out_aligned:
@@ -415,9 +428,26 @@ padlock_ecb_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lecb_loop
-
+ jz .Lecb_break
+ cmpq %rbx,%rcx
+ jae .Lecb_loop
+.Lecb_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
+.p2align 4
+.Lecb_break:
+ cmpq %rbp,%rsp
je .Lecb_done
pxor %xmm0,%xmm0
@@ -431,26 +461,39 @@ padlock_ecb_encrypt:
.Lecb_done:
leaq (%rbp),%rsp
jmp .Lecb_exit
-.p2align 4
-.Lecb_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lecb_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lecb_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lecb_loop
+
.p2align 4
.Lecb_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $128,%rbp
+ movq $128-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lecb_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,200
+ testq %rbp,%rbp
+ jz .Lecb_exit
+
+.Lecb_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lecb_loop
.Lecb_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
@@ -489,8 +532,6 @@ padlock_cbc_encrypt:
leaq 16(%rdx),%rdx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpq $64,%rcx
- jbe .Lcbc_short
testl $32,(%rdx)
jnz .Lcbc_aligned
testq $15,%rdi
@@ -510,6 +551,21 @@ padlock_cbc_encrypt:
negq %rax
andq $512-1,%rbx
leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ cmpq %rbx,%rcx
+ ja .Lcbc_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $64,%rax
+ movq $-64,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz .Lcbc_unaligned_tail
jmp .Lcbc_loop
.p2align 4
.Lcbc_loop:
@@ -541,8 +597,8 @@ padlock_cbc_encrypt:
testq $15,%rdi
jz .Lcbc_out_aligned
movq %rbx,%rcx
- shrq $3,%rcx
leaq (%rsp),%rsi
+ shrq $3,%rcx
.byte 0xf3,0x48,0xa5
subq %rbx,%rdi
.Lcbc_out_aligned:
@@ -552,9 +608,26 @@ padlock_cbc_encrypt:
addq %rbx,%rsi
subq %rbx,%rcx
movq $512,%rbx
- jnz .Lcbc_loop
-
+ jz .Lcbc_break
+ cmpq %rbx,%rcx
+ jae .Lcbc_loop
+.Lcbc_unaligned_tail:
+ xorl %eax,%eax
cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
+.p2align 4
+.Lcbc_break:
+ cmpq %rbp,%rsp
je .Lcbc_done
pxor %xmm0,%xmm0
@@ -568,28 +641,41 @@ padlock_cbc_encrypt:
.Lcbc_done:
leaq (%rbp),%rsp
jmp .Lcbc_exit
-.p2align 4
-.Lcbc_short:
- movq %rsp,%rbp
- subq %rcx,%rsp
- xorq %rbx,%rbx
-.Lcbc_short_copy:
- movups (%rsi,%rbx,1),%xmm0
- leaq 16(%rbx),%rbx
- cmpq %rbx,%rcx
- movaps %xmm0,-16(%rsp,%rbx,1)
- ja .Lcbc_short_copy
- movq %rsp,%rsi
- movq %rcx,%rbx
- jmp .Lcbc_loop
+
.p2align 4
.Lcbc_aligned:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $64,%rbp
+ movq $64-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz .Lcbc_aligned_tail
leaq -16(%rdx),%rax
leaq 16(%rdx),%rbx
shrq $4,%rcx
.byte 0xf3,0x0f,0xa7,208
movdqa (%rax),%xmm0
movdqa %xmm0,-16(%rdx)
+ testq %rbp,%rbp
+ jz .Lcbc_exit
+
+.Lcbc_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp .Lcbc_loop
.Lcbc_exit:
movl $1,%eax
leaq 8(%rsp),%rsp
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
index b068083fa6..69eb468638 100644
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
@@ -180,16 +180,14 @@ _padlock_ecb_encrypt:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $128,%ecx
- jbe .L006ecb_short
testl $32,(%edx)
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
testl $15,%edi
setz %al
testl $15,%esi
setz %bl
testl %ebx,%eax
- jnz .L007ecb_aligned
+ jnz .L006ecb_aligned
negl %eax
movl $512,%ebx
notl %eax
@@ -201,10 +199,28 @@ _padlock_ecb_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
- jmp .L008ecb_loop
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L007ecb_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $128,%eax
+ movl $-128,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L008ecb_unaligned_tail
+ jmp .L007ecb_loop
.align 16
-.L008ecb_loop:
+.L007ecb_loop:
movl %edi,(%ebp)
movl %esi,4(%ebp)
movl %ecx,8(%ebp)
@@ -229,8 +245,8 @@ _padlock_ecb_encrypt:
testl $15,%edi
jz .L010ecb_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
.L010ecb_out_aligned:
@@ -240,43 +256,75 @@ _padlock_ecb_encrypt:
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L008ecb_loop
+ jz .L011ecb_break
+ cmpl %ebx,%ecx
+ jae .L007ecb_loop
+.L008ecb_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.align 16
+.L011ecb_break:
cmpl %ebp,%esp
- je .L011ecb_done
+ je .L012ecb_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L012ecb_bzero:
+.L013ecb_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L012ecb_bzero
-.L011ecb_done:
+ ja .L013ecb_bzero
+.L012ecb_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L013ecb_exit
+ jmp .L014ecb_exit
.align 16
-.L006ecb_short:
+.L006ecb_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L014ecb_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L014ecb_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L008ecb_loop
-.align 16
-.L007ecb_aligned:
+ cmpl $128,%ebp
+ movl $127,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L015ecb_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,200
-.L013ecb_exit:
+ testl %ebp,%ebp
+ jz .L014ecb_exit
+.L015ecb_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L007ecb_loop
+.L014ecb_exit:
movl $1,%eax
leal 4(%esp),%esp
.L004ecb_abort:
@@ -299,19 +347,17 @@ _padlock_cbc_encrypt:
movl 28(%esp),%edx
movl 32(%esp),%ecx
testl $15,%edx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
testl $15,%ecx
- jnz .L015cbc_abort
+ jnz .L016cbc_abort
leal .Lpadlock_saved_context,%eax
pushfl
cld
call __padlock_verify_ctx
-.L016cbc_pic_point:
+.L017cbc_pic_point:
leal 16(%edx),%edx
xorl %eax,%eax
xorl %ebx,%ebx
- cmpl $64,%ecx
- jbe .L017cbc_short
testl $32,(%edx)
jnz .L018cbc_aligned
testl $15,%edi
@@ -331,7 +377,25 @@ _padlock_cbc_encrypt:
negl %eax
andl $511,%ebx
leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
andl $-16,%esp
+ movl %eax,16(%ebp)
+ cmpl %ebx,%ecx
+ ja .L019cbc_loop
+ movl %esi,%eax
+ cmpl %esp,%ebp
+ cmovel %edi,%eax
+ addl %ecx,%eax
+ negl %eax
+ andl $4095,%eax
+ cmpl $64,%eax
+ movl $-64,%eax
+ cmovael %ebx,%eax
+ andl %eax,%ebx
+ jz .L020cbc_unaligned_tail
jmp .L019cbc_loop
.align 16
.L019cbc_loop:
@@ -343,13 +407,13 @@ _padlock_cbc_encrypt:
testl $15,%edi
cmovnzl %esp,%edi
testl $15,%esi
- jz .L020cbc_inp_aligned
+ jz .L021cbc_inp_aligned
shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
movl %ebx,%ecx
movl %edi,%esi
-.L020cbc_inp_aligned:
+.L021cbc_inp_aligned:
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
@@ -359,61 +423,93 @@ _padlock_cbc_encrypt:
movl (%ebp),%edi
movl 12(%ebp),%ebx
testl $15,%edi
- jz .L021cbc_out_aligned
+ jz .L022cbc_out_aligned
movl %ebx,%ecx
- shrl $2,%ecx
leal (%esp),%esi
+ shrl $2,%ecx
.byte 243,165
subl %ebx,%edi
-.L021cbc_out_aligned:
+.L022cbc_out_aligned:
movl 4(%ebp),%esi
movl 8(%ebp),%ecx
addl %ebx,%edi
addl %ebx,%esi
subl %ebx,%ecx
movl $512,%ebx
- jnz .L019cbc_loop
+ jz .L023cbc_break
+ cmpl %ebx,%ecx
+ jae .L019cbc_loop
+.L020cbc_unaligned_tail:
+ xorl %eax,%eax
+ cmpl %ebp,%esp
+ cmovel %ecx,%eax
+ subl %eax,%esp
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.align 16
+.L023cbc_break:
cmpl %ebp,%esp
- je .L022cbc_done
+ je .L024cbc_done
pxor %xmm0,%xmm0
leal (%esp),%eax
-.L023cbc_bzero:
+.L025cbc_bzero:
movaps %xmm0,(%eax)
leal 16(%eax),%eax
cmpl %eax,%ebp
- ja .L023cbc_bzero
-.L022cbc_done:
+ ja .L025cbc_bzero
+.L024cbc_done:
+ movl 16(%ebp),%ebp
leal 24(%ebp),%esp
- jmp .L024cbc_exit
-.align 16
-.L017cbc_short:
- xorl %eax,%eax
- leal -24(%esp),%ebp
- subl %ecx,%eax
- leal (%eax,%ebp,1),%esp
- andl $-16,%esp
- xorl %ebx,%ebx
-.L025cbc_short_copy:
- movups (%esi,%ebx,1),%xmm0
- leal 16(%ebx),%ebx
- cmpl %ebx,%ecx
- movaps %xmm0,-16(%esp,%ebx,1)
- ja .L025cbc_short_copy
- movl %esp,%esi
- movl %ecx,%ebx
- jmp .L019cbc_loop
+ jmp .L026cbc_exit
.align 16
.L018cbc_aligned:
+ leal (%esi,%ecx,1),%ebp
+ negl %ebp
+ andl $4095,%ebp
+ xorl %eax,%eax
+ cmpl $64,%ebp
+ movl $63,%ebp
+ cmovael %eax,%ebp
+ andl %ecx,%ebp
+ subl %ebp,%ecx
+ jz .L027cbc_aligned_tail
leal -16(%edx),%eax
leal 16(%edx),%ebx
shrl $4,%ecx
.byte 243,15,167,208
movaps (%eax),%xmm0
movaps %xmm0,-16(%edx)
-.L024cbc_exit:
+ testl %ebp,%ebp
+ jz .L026cbc_exit
+.L027cbc_aligned_tail:
+ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+ movl %ebp,%esp
+ movl %ebp,%eax
+ subl %ecx,%esp
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ movl %edi,%eax
+ movl %ecx,%ebx
+ shrl $2,%ecx
+ leal (%esp),%edi
+.byte 243,165
+ movl %esp,%esi
+ movl %eax,%edi
+ movl %ebx,%ecx
+ jmp .L019cbc_loop
+.L026cbc_exit:
movl $1,%eax
leal 4(%esp),%esp
-.L015cbc_abort:
+.L016cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -437,10 +533,10 @@ __win32_segv_handler:
movl 4(%esp),%edx
movl 12(%esp),%ecx
cmpl $3221225477,(%edx)
- jne .L026ret
+ jne .L028ret
addl $4,184(%ecx)
movl $0,%eax
-.L026ret:
+.L028ret:
ret
.globl _padlock_sha1_oneshot
.def _padlock_sha1_oneshot; .scl 2; .type 32; .endef