diff options
author | Muhammad Faiz <mfcc64@gmail.com> | 2017-03-16 11:33:16 +0700 |
---|---|---|
committer | Muhammad Faiz <mfcc64@gmail.com> | 2017-03-19 12:24:41 +0700 |
commit | de1308429ae649c899b74365f0dc72847676ba75 (patch) | |
tree | f3072f9c6af39f6fbaa110452cfa8afba1a20887 | |
parent | 3d5c2169e44e98de1589c13d593f62c1b73cf94e (diff) | |
download | ffmpeg-de1308429ae649c899b74365f0dc72847676ba75.tar.gz |
swresample/x86/resample: extend resample_double to support avx and fma3
benchmark:
sse2 10.670s
avx 8.763s
fma3 8.380s
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
-rw-r--r-- | libswresample/x86/resample.asm | 15 |
-rw-r--r-- | libswresample/x86/resample_init.c | 10 |
2 files changed, 22 insertions, 3 deletions
```diff
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index 4163df1aa1..7107cf9d42 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -203,7 +203,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_count, dst, frac, \
     ; horizontal sum & store
 %if mmsize == 32
     vextractf128 xm1, m0, 0x1
-    addps xm0, xm1
+    addp%4 xm0, xm1
 %endif
     movhlps xm1, xm0
 %ifidn %1, float
@@ -489,8 +489,8 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
 %if mmsize == 32
     vextractf128 xm1, m0, 0x1
     vextractf128 xm3, m2, 0x1
-    addps xm0, xm1
-    addps xm2, xm3
+    addp%4 xm0, xm1
+    addp%4 xm2, xm3
 %endif
     cvtsi2s%4 xm1, fracd
     subp%4 xm2, xm0
@@ -608,3 +608,12 @@ RESAMPLE_FNS int16, 2, 1
 
 INIT_XMM sse2
 RESAMPLE_FNS double, 8, 3, d, pdbl_1
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
+%endif
diff --git a/libswresample/x86/resample_init.c b/libswresample/x86/resample_init.c
index e515762b98..c6b2a36060 100644
--- a/libswresample/x86/resample_init.c
+++ b/libswresample/x86/resample_init.c
@@ -42,6 +42,8 @@ RESAMPLE_FUNCS(float, avx);
 RESAMPLE_FUNCS(float, fma3);
 RESAMPLE_FUNCS(float, fma4);
 RESAMPLE_FUNCS(double, sse2);
+RESAMPLE_FUNCS(double, avx);
+RESAMPLE_FUNCS(double, fma3);
 
 av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
 {
@@ -85,6 +87,14 @@ av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
             c->dsp.resample_linear = ff_resample_linear_double_sse2;
             c->dsp.resample_common = ff_resample_common_double_sse2;
         }
+        if (EXTERNAL_AVX_FAST(mm_flags)) {
+            c->dsp.resample_linear = ff_resample_linear_double_avx;
+            c->dsp.resample_common = ff_resample_common_double_avx;
+        }
+        if (EXTERNAL_FMA3_FAST(mm_flags)) {
+            c->dsp.resample_linear = ff_resample_linear_double_fma3;
+            c->dsp.resample_common = ff_resample_common_double_fma3;
+        }
         break;
     }
 }
```