diff options
author | James Almer <jamrial@gmail.com> | 2015-08-03 03:28:37 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2015-08-03 17:11:13 -0300 |
commit | 5750d6c5e9d184488f4dc0f9e81cbcc28cb2f2d1 (patch) | |
tree | 41fcfa1cde031405eebbab47c280511c96476d28 | |
parent | 2ca0ed9cfda21c8a6f9884b93613602782dcda71 (diff) | |
download | ffmpeg-5750d6c5e9d184488f4dc0f9e81cbcc28cb2f2d1.tar.gz |
x86: move XOP emulation code back to x86inc
Only two functions that use xop multiply-accumulate instructions where the
first operand is the same as the fourth actually took advantage of the macros.
This further reduces differences with x264's x86inc.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/flacdsp.asm | 9 | ||||
-rw-r--r-- | libavutil/x86/x86inc.asm | 16 | ||||
-rw-r--r-- | libavutil/x86/x86util.asm | 19 | ||||
-rw-r--r-- | libswresample/x86/resample.asm | 7 |
4 files changed, 31 insertions, 20 deletions
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
index 901c440ccd..7138611526 100644
--- a/libavcodec/x86/flacdsp.asm
+++ b/libavcodec/x86/flacdsp.asm
@@ -25,6 +25,15 @@
 
 SECTION .text
 
+%macro PMACSDQL 5
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1
+%else
+    pmuldq %2, %3
+    paddq %1, %2
+%endif
+%endmacro
+
 %macro LPC_32 1
 INIT_XMM %1
 cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d4c5e698fa..28a2d87f8f 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1427,6 +1427,22 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
 %undef i
 %undef j
 
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %else
+            %6 %1, %2, %3
+            %7 %1, %4
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
 ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
 ; This lets us use tzcnt without bumping the yasm version requirement yet.
 %define tzcnt rep bsf
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index d6702c1466..bf64d179b9 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -765,25 +765,6 @@
 %endif
 %endmacro
 
-%macro PMA_EMU 4
-    %macro %1 5-8 %2, %3, %4
-        %if cpuflag(xop)
-            v%6 %1, %2, %3, %4
-        %elifidn %1, %4
-            %7 %5, %2, %3
-            %8 %1, %4, %5
-        %else
-            %7 %1, %2, %3
-            %8 %1, %4
-        %endif
-    %endmacro
-%endmacro
-
-PMA_EMU PMACSWW, pmacsww, pmullw, paddw
-PMA_EMU PMACSDD, pmacsdd, pmulld, paddd ; sse4 emulation
-PMA_EMU PMACSDQL, pmacsdql, pmuldq, paddq ; sse4 emulation
-PMA_EMU PMADCSWD, pmadcswd, pmaddwd, paddd
-
 ; Wrapper for non-FMA version of fmaddps
 %macro FMULADD_PS 5
 %if cpuflag(fma3) || cpuflag(fma4)
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index a57ff37bb9..4989aa6991 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -176,7 +176,12 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
 .inner_loop:
     movu m1, [srcq+min_filter_count_x4q*1]
 %ifidn %1, int16
-    PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1
+%if cpuflag(xop)
+    vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
+    pmaddwd m1, [filterq+min_filter_count_x4q*1]
+    paddd m0, m1
+%endif
 %else ; float/double
 %if cpuflag(fma4) || cpuflag(fma3)
     fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0