diff options
author | Martin Storsjö <martin@martin.st> | 2013-07-19 10:59:17 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2013-07-22 10:15:37 +0300 |
commit | b63bb251ea6d6ba23295294e37a92625c0192206 (patch) | |
tree | 0557efc37e13206b791ac55ed6c2302e3272776e | |
parent | d6e4f5fef0d811e180fd7541941e07dca9e11dc0 (diff) | |
download | ffmpeg-b63bb251ea6d6ba23295294e37a92625c0192206.tar.gz |
arm: Add VFP-accelerated version of imdct_half
               Before            After
               Mean     StdDev   Mean     StdDev   Change
This function  2653.0   28.5     1108.8   51.4     +139.3%
Overall        17049.5  408.2    15973.0  223.2    +6.7%
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- | libavcodec/arm/Makefile | 1 |
-rw-r--r-- | libavcodec/arm/fft_init_arm.c | 9 |
-rw-r--r-- | libavcodec/arm/mdct_vfp.S | 206 |
-rw-r--r-- | libavcodec/arm/synth_filter_vfp.S | 2 |
4 files changed, 217 insertions, 1 deletion
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index c5f90c2427..e95d94ace8 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -53,6 +53,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \ arm/vp8dsp_armv6.o VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o +VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o NEON-OBJS += arm/fmtconvert_neon.o diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 133fb84f69..1c4568ddc2 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -26,6 +26,8 @@ void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); +void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); + void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); @@ -48,6 +50,13 @@ av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); + if (have_vfp(cpu_flags)) { +#if CONFIG_MDCT + if (!have_vfpv3(cpu_flags)) + s->imdct_half = ff_imdct_half_vfp; +#endif + } + if (have_neon(cpu_flags)) { s->fft_permute = ff_fft_permute_neon; s->fft_calc = ff_fft_calc_neon; diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S new file mode 100644 index 0000000000..7413a41c66 --- /dev/null +++ b/libavcodec/arm/mdct_vfp.S @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * Author: Ben Avison <bavison@riscosopen.org> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +CONTEXT .req a1 +ORIGOUT .req a2 +IN .req a3 +OUT .req v1 +REVTAB .req v2 +TCOS .req v3 +TSIN .req v4 +OLDFPSCR .req v5 +J0 .req a2 +J1 .req a4 +J2 .req ip +J3 .req lr + +.macro prerotation_innerloop + .set trig_lo, k + .set trig_hi, n4 - k - 2 + .set in_lo, trig_lo * 2 + .set in_hi, trig_hi * 2 + vldr d8, [TCOS, #trig_lo*4] @ s16,s17 + vldr d9, [TCOS, #trig_hi*4] @ s18,s19 + vldr s0, [IN, #in_hi*4 + 12] + vldr s1, [IN, #in_hi*4 + 4] + vldr s2, [IN, #in_lo*4 + 12] + vldr s3, [IN, #in_lo*4 + 4] + vmul.f s8, s0, s16 @ vector operation + vldr d10, [TSIN, #trig_lo*4] @ s20,s21 + vldr d11, [TSIN, #trig_hi*4] @ s22,s23 + vldr s4, [IN, #in_lo*4] + vldr s5, [IN, #in_lo*4 + 8] + vldr s6, [IN, #in_hi*4] + vldr s7, [IN, #in_hi*4 + 8] + ldr J0, [REVTAB, #trig_lo*2] + vmul.f s12, s0, s20 @ vector operation + ldr J2, [REVTAB, #trig_hi*2] + mov J1, J0, lsr #16 + and J0, J0, #255 @ halfword value will be < n4 + vmls.f s8, s4, s20 @ vector operation + mov J3, J2, lsr #16 + and J2, J2, #255 @ halfword value will be < n4 + add J0, OUT, J0, lsl #3 + vmla.f s12, s4, s16 @ vector operation + add J1, OUT, J1, lsl #3 + add J2, OUT, J2, lsl #3 + add J3, OUT, J3, lsl #3 + vstr s8, [J0] + vstr s9, [J1] + vstr s10, [J2] + vstr s11, [J3] + vstr s12, [J0, #4] + vstr s13, [J1, #4] + vstr s14, [J2, #4] + vstr s15, [J3, #4] + .set k, k + 2 +.endm + +.macro postrotation_innerloop tail, head + .set trig_lo_head, n8 - k - 2 + .set trig_hi_head, n8 + k + .set out_lo_head, trig_lo_head * 2 + .set 
out_hi_head, trig_hi_head * 2 + .set trig_lo_tail, n8 - (k - 2) - 2 + .set trig_hi_tail, n8 + (k - 2) + .set out_lo_tail, trig_lo_tail * 2 + .set out_hi_tail, trig_hi_tail * 2 + .if (k & 2) == 0 + TCOS_D0_HEAD .req d10 @ s20,s21 + TCOS_D1_HEAD .req d11 @ s22,s23 + TCOS_S0_TAIL .req s24 + .else + TCOS_D0_HEAD .req d12 @ s24,s25 + TCOS_D1_HEAD .req d13 @ s26,s27 + TCOS_S0_TAIL .req s20 + .endif + .ifnc "\tail","" + vmls.f s8, s0, TCOS_S0_TAIL @ vector operation + .endif + .ifnc "\head","" + vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17 + vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19 + vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4] + .endif + .ifnc "\tail","" + vmla.f s12, s4, TCOS_S0_TAIL @ vector operation + .endif + .ifnc "\head","" + vldr s0, [OUT, #out_lo_head*4] + vldr s1, [OUT, #out_lo_head*4 + 8] + vldr s2, [OUT, #out_hi_head*4] + vldr s3, [OUT, #out_hi_head*4 + 8] + vldr s4, [OUT, #out_lo_head*4 + 4] + vldr s5, [OUT, #out_lo_head*4 + 12] + vldr s6, [OUT, #out_hi_head*4 + 4] + vldr s7, [OUT, #out_hi_head*4 + 12] + .endif + .ifnc "\tail","" + vstr s8, [OUT, #out_lo_tail*4] + vstr s9, [OUT, #out_lo_tail*4 + 8] + vstr s10, [OUT, #out_hi_tail*4] + vstr s11, [OUT, #out_hi_tail*4 + 8] + .endif + .ifnc "\head","" + vmul.f s8, s4, s16 @ vector operation + .endif + .ifnc "\tail","" + vstr s12, [OUT, #out_hi_tail*4 + 12] + vstr s13, [OUT, #out_hi_tail*4 + 4] + vstr s14, [OUT, #out_lo_tail*4 + 12] + vstr s15, [OUT, #out_lo_tail*4 + 4] + .endif + .ifnc "\head","" + vmul.f s12, s0, s16 @ vector operation + vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4] + .endif + .unreq TCOS_D0_HEAD + .unreq TCOS_D1_HEAD + .unreq TCOS_S0_TAIL + .ifnc "\head","" + .set k, k + 2 + .endif +.endm + + +/* void ff_imdct_half_vfp(FFTContext *s, + * FFTSample *output, + * const FFTSample *input) + */ +function ff_imdct_half_vfp, export=1 + ldr ip, [CONTEXT, #5*4] @ mdct_bits + teq ip, #6 + it ne + bne ff_imdct_half_c @ only case currently accelerated is the one used by DCA + + .set n, 1<<6 + .set n2, n/2 + .set 
n4, n/4 + .set n8, n/8 + + push {v1-v5,lr} + vpush {s16-s27} + fmrx OLDFPSCR, FPSCR + ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + fmxr FPSCR, lr + mov OUT, ORIGOUT + ldr REVTAB, [CONTEXT, #2*4] + ldr TCOS, [CONTEXT, #6*4] + ldr TSIN, [CONTEXT, #7*4] + + .set k, 0 + .rept n8/2 + prerotation_innerloop + .endr + + fmxr FPSCR, OLDFPSCR + mov ORIGOUT, OUT + ldr ip, [CONTEXT, #9*4] + blx ip @ s->fft_calc(s, output) + ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 + fmxr FPSCR, lr + + .set k, 0 + postrotation_innerloop , head + .rept n8/2 - 1 + postrotation_innerloop tail, head + .endr + postrotation_innerloop tail + + fmxr FPSCR, OLDFPSCR + vpop {s16-s27} + pop {v1-v5,pc} +endfunc + + .unreq CONTEXT + .unreq ORIGOUT + .unreq IN + .unreq OUT + .unreq REVTAB + .unreq TCOS + .unreq TSIN + .unreq OLDFPSCR + .unreq J0 + .unreq J1 + .unreq J2 + .unreq J3 diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S index 1b99e64598..c219c41875 100644 --- a/libavcodec/arm/synth_filter_vfp.S +++ b/libavcodec/arm/synth_filter_vfp.S @@ -132,7 +132,7 @@ function ff_synth_filter_float_vfp, export=1 str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case - bl ff_imdct_half_c + bl ff_imdct_half_vfp VFP vmov SCALE, s16 fmrx OLDFPSCR, FPSCR |