summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- libavcodec/x86/Makefile 1
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 3
-rw-r--r-- libavcodec/x86/dsputil_mmx.h 2
-rw-r--r-- libavcodec/x86/vorbisdsp.asm 83
-rw-r--r-- libavcodec/x86/vorbisdsp_init.c 77
5 files changed, 92 insertions, 74 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6069968a09..0bade86375 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -62,6 +62,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
x86/rv40dsp.o
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
+YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 65247c0016..9a282e8dfb 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -38,9 +38,6 @@
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
- { 0x8000000080000000ULL, 0x8000000080000000ULL };
-
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index a142406a6e..49688dc665 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -31,8 +31,6 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
extern const uint64_t ff_bone;
extern const uint64_t ff_wtwo;
-extern const uint64_t ff_pdw_80000000[2];
-
extern const xmm_reg ff_pw_3;
extern const xmm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
new file mode 100644
index 0000000000..c54650eef5
--- /dev/null
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -0,0 +1,83 @@
+;******************************************************************************
+;* Vorbis x86 optimizations
+;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pdw_80000000: times 4 dd 0x80000000
+
+SECTION .text
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
+ pxor m7, m7 ; m7 = 0.0, reference operand for the compares below
+ lea magq, [magq+block_sizeq*4] ; point mag at the end of the block
+ lea angq, [angq+block_sizeq*4] ; point ang at the end of the block
+ neg block_sizeq ; index now runs from -block_size up towards 0
+.loop:
+ mova m0, [magq+block_sizeq*4] ; m0 = two mag values (m)
+ mova m1, [angq+block_sizeq*4] ; m1 = two ang values (a)
+ mova m2, m0
+ mova m3, m1
+ pfcmpge m2, m7 ; m2 = all-ones where m >= 0.0
+ pfcmpge m3, m7 ; m3 = all-ones where a >= 0.0
+ pslld m2, 31 ; keep only the sign bit of the m >= 0.0 mask
+ pxor m1, m2 ; a' = a with its sign flipped where m >= 0.0
+ mova m4, m3
+ pand m3, m1 ; m3 = a' where a >= 0.0, else 0
+ pandn m4, m1 ; m4 = a' where !(a >= 0.0), else 0
+ pfadd m3, m0 ; new a = m + ((a >= 0.0) & a')
+ pfsub m0, m4 ; new m = m - (!(a >= 0.0) & a')
+ mova [angq+block_sizeq*4], m3
+ mova [magq+block_sizeq*4], m0
+ add block_sizeq, 2 ; two floats per iteration
+ jl .loop ; NOTE(review): body runs once before this test; assumes block_size > 0 - confirm
+ femms ; leave MMX/3DNow! state before returning
+ RET
+%endif
+
+INIT_XMM sse
+cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
+ mova m5, [pdw_80000000] ; m5 = 0x80000000 (sign bit) in every 32-bit lane
+ xor cntrq, cntrq ; element index starts at 0
+align 16
+.loop:
+ mova m0, [magq+cntrq*4] ; m0 = four mag values (m)
+ mova m1, [angq+cntrq*4] ; m1 = four ang values (a)
+ xorps m2, m2
+ xorps m3, m3
+ cmpleps m2, m0 ; m2 = all-ones where 0.0 <= m
+ cmpleps m3, m1 ; m3 = all-ones where 0.0 <= a
+ andps m2, m5 ; keep only the sign bit of the 0.0 <= m mask
+ xorps m1, m2 ; a' = a with its sign flipped where 0.0 <= m
+ mova m4, m3
+ andps m3, m1 ; m3 = a' where 0.0 <= a, else 0
+ andnps m4, m1 ; m4 = a' where !(0.0 <= a), else 0
+ addps m3, m0 ; new a = m + ((0.0 <= a) & a')
+ subps m0, m4 ; new m = m - (!(0.0 <= a) & a')
+ mova [angq+cntrq*4], m3
+ mova [magq+cntrq*4], m0
+ add cntrq, 4 ; four floats per iteration
+ cmp cntrq, block_sizeq
+ jl .loop ; NOTE(review): body runs once before this test; assumes block_size > 0 - confirm
+ RET
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index 6f86f6720d..703cd93f17 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -21,83 +21,22 @@
#include "config.h"
#include "libavutil/cpu.h"
#include "libavcodec/vorbisdsp.h"
-#include "dsputil_mmx.h" // for ff_pdw_80000000
-#if HAVE_INLINE_ASM
-#if ARCH_X86_32
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang,
- intptr_t blocksize)
-{
- int i;
- __asm__ volatile ("pxor %%mm7, %%mm7":);
- for (i = 0; i < blocksize; i += 2) {
- __asm__ volatile (
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
- "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
- "pslld $31, %%mm2 \n\t" // keep only the sign bit
- "pxor %%mm2, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "pand %%mm1, %%mm3 \n\t"
- "pandn %%mm1, %%mm4 \n\t"
- "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
- "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
- "movq %%mm3, %1 \n\t"
- "movq %%mm0, %0 \n\t"
- : "+m"(mag[i]), "+m"(ang[i])
- :: "memory"
- );
- }
- __asm__ volatile ("femms");
-}
-#endif
-
-static void vorbis_inverse_coupling_sse(float *mag, float *ang,
- intptr_t blocksize)
-{
- int i;
-
- __asm__ volatile (
- "movaps %0, %%xmm5 \n\t"
- :: "m"(ff_pdw_80000000[0])
- );
- for (i = 0; i < blocksize; i += 4) {
- __asm__ volatile (
- "movaps %0, %%xmm0 \n\t"
- "movaps %1, %%xmm1 \n\t"
- "xorps %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm3, %%xmm3 \n\t"
- "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
- "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
- "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
- "xorps %%xmm2, %%xmm1 \n\t"
- "movaps %%xmm3, %%xmm4 \n\t"
- "andps %%xmm1, %%xmm3 \n\t"
- "andnps %%xmm1, %%xmm4 \n\t"
- "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
- "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
- "movaps %%xmm3, %1 \n\t"
- "movaps %%xmm0, %0 \n\t"
- : "+m"(mag[i]), "+m"(ang[i])
- :: "memory"
- );
- }
-}
-#endif
+void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
+ intptr_t blocksize);
+void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
+ intptr_t blocksize);
void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (mm_flags & AV_CPU_FLAG_3DNOW)
- dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+ dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
if (mm_flags & AV_CPU_FLAG_SSE)
- dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-#endif /* HAVE_INLINE_ASM */
+ dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; /* assigned last: replaces the 3DNow! pointer when both flags are set */
+#endif /* HAVE_YASM */
}