diff options
-rw-r--r-- | libavcodec/x86/Makefile | 1 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 3 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.h | 2 | ||||
-rw-r--r-- | libavcodec/x86/vorbisdsp.asm | 83 | ||||
-rw-r--r-- | libavcodec/x86/vorbisdsp_init.c | 77 |
5 files changed, 92 insertions, 74 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6069968a09..0bade86375 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -62,6 +62,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ x86/rv40dsp.o YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o +YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 65247c0016..9a282e8dfb 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -38,9 +38,6 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; -DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = - { 0x8000000080000000ULL, 0x8000000080000000ULL }; - DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h index a142406a6e..49688dc665 100644 --- a/libavcodec/x86/dsputil_mmx.h +++ b/libavcodec/x86/dsputil_mmx.h @@ -31,8 +31,6 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg; extern const uint64_t ff_bone; extern const uint64_t ff_wtwo; -extern const uint64_t ff_pdw_80000000[2]; - extern const xmm_reg ff_pw_3; extern const xmm_reg ff_pw_4; extern const xmm_reg ff_pw_5; diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm new file mode 100644 index 0000000000..c54650eef5 --- /dev/null +++ b/libavcodec/x86/vorbisdsp.asm @@ -0,0 +1,83 @@ +;****************************************************************************** +;* Vorbis x86 optimizations +;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pdw_80000000: times 4 dd 0x80000000 + +SECTION .text + +%if ARCH_X86_32 +INIT_MMX 3dnow +cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size + pxor m7, m7 + lea magq, [magq+block_sizeq*4] + lea angq, [angq+block_sizeq*4] + neg block_sizeq +.loop: + mova m0, [magq+block_sizeq*4] + mova m1, [angq+block_sizeq*4] + mova m2, m0 + mova m3, m1 + pfcmpge m2, m7 ; m <= 0.0 + pfcmpge m3, m7 ; a <= 0.0 + pslld m2, 31 ; keep only the sign bit + pxor m1, m2 + mova m4, m3 + pand m3, m1 + pandn m4, m1 + pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) + pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) + mova [angq+block_sizeq*4], m3 + mova [magq+block_sizeq*4], m0 + add block_sizeq, 2 + jl .loop + femms + RET +%endif + +INIT_XMM sse +cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr + mova m5, [pdw_80000000] + xor cntrq, cntrq +align 16 +.loop: + mova m0, [magq+cntrq*4] + mova m1, [angq+cntrq*4] + xorps m2, m2 + xorps m3, m3 + cmpleps m2, m0 ; m <= 0.0 + cmpleps m3, m1 ; a <= 0.0 + andps m2, m5 ; keep only the sign bit + xorps m1, m2 + mova m4, m3 + andps m3, m1 + andnps m4, m1 + addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) + subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) + mova [angq+cntrq*4], m3 + mova [magq+cntrq*4], m0 + add cntrq, 4 + cmp cntrq, block_sizeq + jl .loop + RET diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c index 6f86f6720d..703cd93f17 100644 --- a/libavcodec/x86/vorbisdsp_init.c +++ b/libavcodec/x86/vorbisdsp_init.c @@ -21,83 +21,22 @@ #include "config.h" #include "libavutil/cpu.h" #include "libavcodec/vorbisdsp.h" -#include "dsputil_mmx.h" // for ff_pdw_80000000 -#if HAVE_INLINE_ASM -#if ARCH_X86_32 -static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, - intptr_t blocksize) -{ - int i; - __asm__ volatile ("pxor %%mm7, %%mm7":); - for (i = 0; i < blocksize; i += 2) { - __asm__ volatile ( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 - "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 - "pslld $31, %%mm2 \n\t" // keep only the sign bit - "pxor %%mm2, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm1, %%mm3 \n\t" - "pandn %%mm1, %%mm4 \n\t" - "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) - "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) - "movq %%mm3, %1 \n\t" - "movq %%mm0, %0 \n\t" - : "+m"(mag[i]), "+m"(ang[i]) - :: "memory" - ); - } - __asm__ volatile ("femms"); -} -#endif - -static void vorbis_inverse_coupling_sse(float *mag, float *ang, - intptr_t blocksize) -{ - int i; - - __asm__ volatile ( - "movaps %0, %%xmm5 \n\t" - :: "m"(ff_pdw_80000000[0]) - ); - for (i = 0; i < blocksize; i += 4) { - __asm__ volatile ( - "movaps %0, %%xmm0 \n\t" - "movaps %1, %%xmm1 \n\t" - "xorps %%xmm2, %%xmm2 \n\t" - "xorps %%xmm3, %%xmm3 \n\t" - "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 - "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 - "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit - "xorps %%xmm2, %%xmm1 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "andps %%xmm1, %%xmm3 \n\t" - "andnps %%xmm1, %%xmm4 \n\t" - "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) - "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) - "movaps %%xmm3, %1 \n\t" - "movaps %%xmm0, %0 \n\t" - : "+m"(mag[i]), "+m"(ang[i]) - :: "memory" - ); - } -} -#endif +void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang, + intptr_t blocksize); +void ff_vorbis_inverse_coupling_sse(float *mag, float *ang, + intptr_t blocksize); void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp) { -#if HAVE_INLINE_ASM +#if HAVE_YASM int mm_flags = av_get_cpu_flags(); #if ARCH_X86_32 if (mm_flags & AV_CPU_FLAG_3DNOW) - dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; + dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow; #endif /* ARCH_X86_32 */ if (mm_flags & AV_CPU_FLAG_SSE) - dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; -#endif /* HAVE_INLINE_ASM */ + dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; +#endif /* HAVE_YASM */ } |