summary refs log tree commit diff
path: root/libavcodec/arm/dsputil_neon.S
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/arm/dsputil_neon.S')
-rw-r--r-- libavcodec/arm/dsputil_neon.S 142
1 files changed, 43 insertions, 99 deletions
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 3b9b542a68..1574ad6496 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -587,106 +587,50 @@ NOVFP vdup.32 q8, r2
.unreq len
endfunc
-function ff_vector_fmul_sv_scalar_2_neon, export=1
-VFP vdup.32 d16, d0[0]
-NOVFP vdup.32 d16, r3
-NOVFP ldr r3, [sp]
- vld1.32 {d0},[r1,:64]!
- vld1.32 {d1},[r1,:64]!
-1: subs r3, r3, #4
- vmul.f32 d4, d0, d16
- vmul.f32 d5, d1, d16
- ldr r12, [r2], #4
- vld1.32 {d2},[r12,:64]
- ldr r12, [r2], #4
- vld1.32 {d3},[r12,:64]
- vmul.f32 d4, d4, d2
- vmul.f32 d5, d5, d3
- beq 2f
- vld1.32 {d0},[r1,:64]!
- vld1.32 {d1},[r1,:64]!
- vst1.32 {d4},[r0,:64]!
- vst1.32 {d5},[r0,:64]!
- b 1b
-2: vst1.32 {d4},[r0,:64]!
- vst1.32 {d5},[r0,:64]!
- bx lr
-endfunc
-
-function ff_vector_fmul_sv_scalar_4_neon, export=1
-VFP vdup.32 q10, d0[0]
-NOVFP vdup.32 q10, r3
-NOVFP ldr r3, [sp]
- push {lr}
- bics lr, r3, #7
- beq 3f
- vld1.32 {q0},[r1,:128]!
- vld1.32 {q2},[r1,:128]!
-1: ldr r12, [r2], #4
- vld1.32 {q1},[r12,:128]
- ldr r12, [r2], #4
- vld1.32 {q3},[r12,:128]
- vmul.f32 q8, q0, q10
- vmul.f32 q8, q8, q1
- vmul.f32 q9, q2, q10
- vmul.f32 q9, q9, q3
- subs lr, lr, #8
- beq 2f
- vld1.32 {q0},[r1,:128]!
- vld1.32 {q2},[r1,:128]!
- vst1.32 {q8},[r0,:128]!
- vst1.32 {q9},[r0,:128]!
- b 1b
-2: vst1.32 {q8},[r0,:128]!
- vst1.32 {q9},[r0,:128]!
- ands r3, r3, #7
- it eq
- popeq {pc}
-3: vld1.32 {q0},[r1,:128]!
- ldr r12, [r2], #4
- vld1.32 {q1},[r12,:128]
- vmul.f32 q0, q0, q10
- vmul.f32 q0, q0, q1
- vst1.32 {q0},[r0,:128]!
- subs r3, r3, #4
- bgt 3b
- pop {pc}
-endfunc
-
-function ff_sv_fmul_scalar_2_neon, export=1
+function ff_vector_fmac_scalar_neon, export=1 /* multiply-accumulate: r0[i] += r1[i] * scalar, 4 floats per q register, 16 per main-loop pass */
VFP len .req r2
+VFP acc .req r3 /* acc: read pointer into the r0 buffer. NOTE(review): no matching .unreq acc is visible in this hunk */
NOVFP len .req r3
-VFP vdup.32 q8, d0[0]
-NOVFP vdup.32 q8, r2
- ldr r12, [r1], #4
- vld1.32 {d0},[r12,:64]
- ldr r12, [r1], #4
- vld1.32 {d1},[r12,:64]
-1: vmul.f32 q1, q0, q8
- subs len, len, #4
+NOVFP acc .req r2 /* in this variant acc shares r2 with the incoming scalar */
+VFP vdup.32 q15, d0[0] /* q15 = scalar broadcast to all four lanes */
+NOVFP vdup.32 q15, r2 /* same broadcast from r2; must stay ahead of the mov into acc, which overwrites r2 in this variant */
+ bics r12, len, #15 /* r12 = len with the low four bits cleared (whole groups of 16); sets Z when there is no full group */
+ mov acc, r0 /* loads of the accumulated buffer go through acc, stores go through r0; mov leaves the flags from bics intact */
+ beq 3f /* no full group of 16: use the 4-wide loop only. NOTE(review): with len == 0 that loop still runs once - presumably callers pass len > 0, confirm */
+ vld1.32 {q0}, [r1,:128]! /* prime the pipeline with the first 8 source floats and 8 destination floats; the :128 qualifier asserts a 16-byte aligned address */
+ vld1.32 {q8}, [acc,:128]!
+ vld1.32 {q1}, [r1,:128]!
+ vld1.32 {q9}, [acc,:128]!
+1: vmla.f32 q8, q0, q15 /* q8 += q0 * scalar */
+ vld1.32 {q2}, [r1,:128]! /* fetch the third vector pair of this group while the first multiply-accumulate is in flight */
+ vld1.32 {q10}, [acc,:128]!
+ vmla.f32 q9, q1, q15 /* q9 += q1 * scalar */
+ vld1.32 {q3}, [r1,:128]! /* fourth vector pair of this group */
+ vld1.32 {q11}, [acc,:128]!
+ vmla.f32 q10, q2, q15 /* q10 += q2 * scalar */
+ vst1.32 {q8}, [r0,:128]! /* write back the first result of the group */
+ vmla.f32 q11, q3, q15 /* q11 += q3 * scalar */
+ vst1.32 {q9}, [r0,:128]! /* write back the second result */
+ subs r12, r12, #16 /* one group of 16 consumed; Z set after the last full group */
beq 2f
- ldr r12, [r1], #4
- vld1.32 {d0},[r12,:64]
- ldr r12, [r1], #4
- vld1.32 {d1},[r12,:64]
- vst1.32 {q1},[r0,:128]!
+ vld1.32 {q0}, [r1,:128]! /* more groups follow: start loading the next group ... */
+ vld1.32 {q8}, [acc,:128]!
+ vst1.32 {q10}, [r0,:128]! /* ... interleaved with storing the third result of the current group */
+ vld1.32 {q1}, [r1,:128]!
+ vld1.32 {q9}, [acc,:128]!
+ vst1.32 {q11}, [r0,:128]! /* fourth result of the current group */
b 1b
-2: vst1.32 {q1},[r0,:128]!
- bx lr
- .unreq len
-endfunc
-
-function ff_sv_fmul_scalar_4_neon, export=1
-VFP len .req r2
-NOVFP len .req r3
-VFP vdup.32 q8, d0[0]
-NOVFP vdup.32 q8, r2
-1: ldr r12, [r1], #4
- vld1.32 {q0},[r12,:128]
- vmul.f32 q0, q0, q8
- vst1.32 {q0},[r0,:128]!
+2: vst1.32 {q10}, [r0,:128]! /* last full group: flush the two results still held in registers */
+ vst1.32 {q11}, [r0,:128]!
+ ands len, len, #15 /* len = elements left over after the groups of 16 */
+ it eq /* IT block making the following return conditional */
+ bxeq lr /* nothing left over: return */
+3: vld1.32 {q0}, [r1,:128]! /* remainder loop: one vector of 4 floats per pass */
+ vld1.32 {q8}, [acc,:128]!
+ vmla.f32 q8, q0, q15 /* q8 += q0 * scalar */
+ vst1.32 {q8}, [r0,:128]!
subs len, len, #4
- bgt 1b
+ bgt 3b /* repeat while elements remain; NOTE(review): a remainder that is not a multiple of 4 is rounded up to a full vector */
bx lr
.unreq len
endfunc