summary | refs | log | tree | commit | diff
path: root/libavcodec/arm
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- libavcodec/arm/vp9itxfm_neon.S 57
1 file changed, 20 insertions, 37 deletions
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index ebbbda9248..adc9896db4 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -1575,7 +1575,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
beq idct32x32_dc_add_neon
push {r4-r8,lr}
vpush {q4-q6}
- movrel r8, min_eob_idct_idct_32 + 2
@ Align the stack, allocate a temp buffer
T mov r7, sp
@@ -1597,6 +1596,8 @@ A and r7, sp, #15
cmp r3, #135
ble idct32x32_half_add_neon
+ movrel r8, min_eob_idct_idct_32 + 2
+
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, sp, #(\i*64)
.if \i > 0
@@ -1634,72 +1635,54 @@ A and r7, sp, #15
pop {r4-r8,pc}
endfunc
-function idct32x32_quarter_add_neon
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
.irp i, 0, 4
add r0, sp, #(\i*64)
+.ifc \size,quarter
.if \i == 4
cmp r3, #9
ble 1f
.endif
+.endif
add r2, r6, #(\i*2)
- bl idct32_1d_4x32_pass1_quarter_neon
-.endr
- b 3f
-
-1:
- @ Write zeros to the temp buffer for pass 2
- vmov.i16 q14, #0
- vmov.i16 q15, #0
-.rept 8
- vst1.16 {q14-q15}, [r0,:128]!
-.endr
-3:
-.irp i, 0, 4, 8, 12, 16, 20, 24, 28
- add r0, r4, #(\i)
- mov r1, r5
- add r2, sp, #(\i*2)
- bl idct32_1d_4x32_pass2_quarter_neon
+ bl idct32_1d_4x32_pass1_\size\()_neon
.endr
- add sp, sp, r7
- vpop {q4-q6}
- pop {r4-r8,pc}
-endfunc
-
-function idct32x32_half_add_neon
-.irp i, 0, 4, 8, 12
+.ifc \size,half
+.irp i, 8, 12
add r0, sp, #(\i*64)
-.if \i > 0
- ldrh_post r1, r8, #2
- cmp r3, r1
- it le
- movle r1, #(16 - \i)/2
+.if \i == 12
+ cmp r3, #70
ble 1f
.endif
add r2, r6, #(\i*2)
- bl idct32_1d_4x32_pass1_half_neon
+ bl idct32_1d_4x32_pass1_\size\()_neon
.endr
+.endif
b 3f
1:
@ Write zeros to the temp buffer for pass 2
vmov.i16 q14, #0
vmov.i16 q15, #0
-2:
- subs r1, r1, #1
-.rept 4
+.rept 8
vst1.16 {q14-q15}, [r0,:128]!
.endr
- bne 2b
+
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, r4, #(\i)
mov r1, r5
add r2, sp, #(\i*2)
- bl idct32_1d_4x32_pass2_half_neon
+ bl idct32_1d_4x32_pass2_\size\()_neon
.endr
add sp, sp, r7
vpop {q4-q6}
pop {r4-r8,pc}
endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half