diff options
Diffstat (limited to 'libavcodec/x86')
72 files changed, 2200 insertions, 403 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index eb82ef572e..3e998efafc 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -16,18 +16,22 @@ MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_mmx.o MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o -MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o +MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o \ + x86/dwt.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o MMX-OBJS-$(CONFIG_FFT) += x86/fft.o +MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o +MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp-init.o MMX-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o MMX-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \ x86/rv40dsp_init.o +MMX-OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o @@ -36,9 +40,12 @@ MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o +YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ $(YASM-OBJS-FFT-yes) + +YASM-OBJS-$(CONFIG_DWT) += x86/dwt_yasm.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ @@ -53,9 +60,11 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36_sse.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o +YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ x86/rv40dsp.o +YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_yasm.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm index ef828bb0d5..acbab35cae 100644 --- a/libavcodec/x86/ac3dsp.asm +++ b/libavcodec/x86/ac3dsp.asm @@ -2,25 +2,25 @@ ;* x86-optimized AC-3 DSP utils ;* Copyright (c) 2011 Justin Ruggles ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c index 0ac8685c6d..5549f3e550 100644 --- a/libavcodec/x86/ac3dsp_mmx.c +++ b/libavcodec/x86/ac3dsp_mmx.c @@ -2,20 +2,20 @@ * x86-optimized AC-3 DSP utils * Copyright (c) 2011 Justin Ruggles * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h index 02dbc54db7..3fcd8abbc9 100644 --- a/libavcodec/x86/cabac.h +++ b/libavcodec/x86/cabac.h @@ -73,10 +73,7 @@ "test "lowword" , "lowword" \n\t"\ "jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ - "cmp "end" , %%"REG_c" \n\t"\ - "jge 1f \n\t"\ "add"OPSIZE" $2 , "byte" \n\t"\ - "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ "lea -1("low") , %%ecx \n\t"\ "xor "low" , %%ecx \n\t"\ @@ -93,6 +90,7 @@ #else /* BROKEN_RELOCATIONS */ #define TABLES_ARG +#define RIP_ARG #if HAVE_FAST_CMOV #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ @@ -134,10 +132,7 @@ "test "lowword" , "lowword" \n\t"\ " jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ - "cmp "end" , %%"REG_c" \n\t"\ - "jge 1f \n\t"\ "add"OPSIZE" $2 , "byte" \n\t"\ - "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ "lea -1("low") , %%ecx \n\t"\ "xor "low" , %%ecx \n\t"\ @@ -155,7 +150,8 @@ #endif /* BROKEN_RELOCATIONS */ -#if HAVE_7REGS +#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ + && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) #define get_cabac_inline get_cabac_inline_x86 static av_always_inline int get_cabac_inline_x86(CABACContext *c, uint8_t *const state) @@ -209,10 +205,9 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) "movzwl (%1), %%edx \n\t" "bswap %%edx \n\t" "shrl $15, %%edx \n\t" + "add $2, %1 \n\t" "addl %%edx, %%eax \n\t" - "cmp %a5(%2), %1 \n\t" - "jge 1f \n\t" - "add"OPSIZE" $2, %a4(%2) \n\t" + "mov %1, %a4(%2) \n\t" "1: \n\t" "movl %%eax, %a3(%2) \n\t" diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c index 4ed7d8e598..05f192fc71 100644 --- a/libavcodec/x86/cavsdsp_mmx.c +++ b/libavcodec/x86/cavsdsp_mmx.c @@ -5,20 +5,20 @@ * MMX-optimized DSP functions, based on H.264 optimizations by * Michael Niedermayer and Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index 9d6169ca66..02b5f3fc89 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -2,25 +2,25 @@ ;* 32 point SSE-optimized DCT transform ;* Copyright (c) 2010 Vitor Sessak ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA 32 diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm index 8613485d5d..a09473bdae 100644 --- a/libavcodec/x86/deinterlace.asm +++ b/libavcodec/x86/deinterlace.asm @@ -3,25 +3,25 @@ ;* Copyright (c) 2010 Vitor Sessak ;* Copyright (c) 2002 Michael Niedermayer ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c new file mode 100644 index 0000000000..693a9af4f8 --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_mmx.h" +#include "diracdsp_mmx.h" + +void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); + +#define HPEL_FILTER(MMSIZE, EXT) \ + void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \ + void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \ + \ + static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \ + const uint8_t *src, int stride, int width, int height) \ + { \ + while( height-- ) \ + { \ + ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \ + ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \ + ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \ + \ + dsth += stride; \ + dstv += stride; \ + dstc += stride; \ + src += stride; \ + } \ + } + +#if !ARCH_X86_64 +HPEL_FILTER(8, mmx) +#endif +HPEL_FILTER(16, sse2) + +#define PIXFUNC(PFX, IDX, EXT) \ + /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \ + c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \ + c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT + +void ff_diracdsp_init_mmx(DiracDSPContext* c) +{ + int mm_flags = av_get_cpu_flags(); + +#if HAVE_YASM + c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; +#if !ARCH_X86_64 + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; + c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; + c->dirac_hpel_filter = dirac_hpel_filter_mmx; + c->add_rect_clamped = ff_add_rect_clamped_mmx; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; +#endif +#endif + + PIXFUNC(put, 0, mmx); + PIXFUNC(avg, 0, mmx); + + if (mm_flags & AV_CPU_FLAG_MMX2) { + PIXFUNC(avg, 0, mmx2); + } + + if (mm_flags & AV_CPU_FLAG_SSE2) { +#if HAVE_YASM + c->dirac_hpel_filter = dirac_hpel_filter_sse2; + c->add_rect_clamped = ff_add_rect_clamped_sse2; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; + + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; + c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; +#endif + c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; + c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; + c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; + c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; + } +} diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h new file mode 100644 index 0000000000..3d8e1173fa --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DIRACDSP_H +#define AVCODEC_X86_DIRACDSP_H + +#include "libavcodec/diracdsp.h" + +void ff_diracdsp_init_mmx(DiracDSPContext* c); + +DECL_DIRAC_PIXOP(put, mmx); +DECL_DIRAC_PIXOP(avg, mmx); +DECL_DIRAC_PIXOP(avg, mmx2); + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); + +void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); +void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); + +void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +#endif diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm new file mode 100644 index 0000000000..049bf16dbf --- /dev/null +++ b/libavcodec/x86/diracdsp_yasm.asm @@ -0,0 +1,260 @@ +;****************************************************************************** +;* Copyright (c) 2010 David Conrad +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pw_3: times 8 dw 3 +pw_7: times 8 dw 7 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pb_128: times 16 db 128 + +section .text + +%macro UNPACK_ADD 6 + mov%5 %1, %3 + mov%6 m5, %4 + mova m4, %1 + mova %2, m5 + punpcklbw %1, m7 + punpcklbw m5, m7 + punpckhbw m4, m7 + punpckhbw %2, m7 + paddw %1, m5 + paddw %2, m4 +%endmacro + +%macro HPEL_FILTER 1 +; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); +cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 + mov src0q, srcq + lea stridex3q, [3*strideq] + sub src0q, stridex3q + pxor m7, m7 +.loop: + ; 7*(src[0] + src[1]) + UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a + pmullw m0, [pw_7] + pmullw m1, [pw_7] + + ; 3*( ... + src[-2] + src[3]) + UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a + paddw m0, m2 + paddw m1, m3 + pmullw m0, [pw_3] + pmullw m1, [pw_3] + + ; ... - 7*(src[-1] + src[2]) + UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a + pmullw m2, [pw_7] + pmullw m3, [pw_7] + psubw m0, m2 + psubw m1, m3 + + ; ... - (src[-3] + src[4]) + UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a + psubw m0, m2 + psubw m1, m3 + + paddw m0, [pw_16] + paddw m1, [pw_16] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, mmsize + add srcq, mmsize + add src0q, mmsize + sub widthd, mmsize + jg .loop + RET + +; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); +cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width + dec widthd + pxor m7, m7 + and widthd, ~(mmsize-1) +.loop: + ; 7*(src[0] + src[1]) + UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u + pmullw m0, [pw_7] + pmullw m1, [pw_7] + + ; 3*( ... + src[-2] + src[3]) + UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u + paddw m0, m2 + paddw m1, m3 + pmullw m0, [pw_3] + pmullw m1, [pw_3] + + ; ... - 7*(src[-1] + src[2]) + UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u + pmullw m2, [pw_7] + pmullw m3, [pw_7] + psubw m0, m2 + psubw m1, m3 + + ; ... - (src[-3] + src[4]) + UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u + psubw m0, m2 + psubw m1, m3 + + paddw m0, [pw_16] + paddw m1, [pw_16] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + mova [dstq + widthq], m0 + sub widthd, mmsize + jge .loop + RET +%endmacro + +%macro PUT_RECT 1 +; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) +cglobal put_signed_rect_clamped_%1, 5,7,3, dst, dst_stride, src, src_stride, w, dst2, src2 + mova m0, [pb_128] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + mov r7d, r5m + mov r8d, wd + %define wspill r8d + %define hd r7d +%else + mov r4m, wd + %define wspill r4m + %define hd r5mp +%endif + +.loopy + lea src2q, [srcq+src_strideq*2] + lea dst2q, [dstq+dst_strideq] +.loopx: + sub wd, mmsize + mova m1, [srcq +2*wq] + mova m2, [src2q+2*wq] + packsswb m1, [srcq +2*wq+mmsize] + packsswb m2, [src2q+2*wq+mmsize] + paddb m1, m0 + paddb m2, m0 + mova [dstq +wq], m1 + mova [dst2q+wq], m2 + jg .loopx + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*2] + sub hd, 2 + mov wd, wspill + jg .loopy + RET +%endm + +%macro ADD_RECT 1 +; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) +cglobal add_rect_clamped_%1, 7,7,3, dst, src, stride, idwt, idwt_stride, w, h + mova m0, [pw_32] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + mov r8d, wd + %define wspill r8d +%else + mov r5m, wd + %define wspill r5m +%endif + +.loop: + sub wd, mmsize + movu m1, [srcq +2*wq] ; FIXME: ensure alignment + paddw m1, m0 + psraw m1, 6 + movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment + paddw m2, m0 + psraw m2, 6 + paddw m1, [idwtq+2*wq] + paddw m2, [idwtq+2*wq+mmsize] + packuswb m1, m2 + mova [dstq +wq], m1 + jg .loop + + lea srcq, [srcq + 2*strideq] + add dstq, strideq + lea idwtq, [idwtq+ 2*idwt_strideq] + sub hd, 1 + mov wd, wspill + jg .loop + RET +%endm + +%macro ADD_OBMC 2 +; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) +cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen + pxor m4, m4 +.loop: +%assign i 0 +%rep %1 / mmsize + mova m0, [srcq+i] + mova m1, m0 + punpcklbw m0, m4 + punpckhbw m1, m4 + mova m2, [obmcq+i] + mova m3, m2 + punpcklbw m2, m4 + punpckhbw m3, m4 + pmullw m0, m2 + pmullw m1, m3 + movu m2, [dstq+2*i] + movu m3, [dstq+2*i+mmsize] + paddw m0, m2 + paddw m1, m3 + movu [dstq+2*i], m0 + movu [dstq+2*i+mmsize], m1 +%assign i i+mmsize +%endrep + lea srcq, [srcq+strideq] + lea dstq, [dstq+2*strideq] + add obmcq, 32 + sub yblend, 1 + jg .loop + RET +%endm + +INIT_MMX +%if ARCH_X86_64 == 0 +PUT_RECT mmx +ADD_RECT mmx + +HPEL_FILTER mmx +ADD_OBMC 32, mmx +ADD_OBMC 16, mmx +%endif +ADD_OBMC 8, mmx + +INIT_XMM +PUT_RECT sse2 +ADD_RECT sse2 + +HPEL_FILTER sse2 +ADD_OBMC 32, sse2 +ADD_OBMC 16, sse2 diff --git a/libavcodec/x86/dnxhd_mmx.c b/libavcodec/x86/dnxhd_mmx.c index 54293aa280..3c1e86955f 100644 --- a/libavcodec/x86/dnxhd_mmx.c +++ b/libavcodec/x86/dnxhd_mmx.c @@ -4,20 +4,20 @@ * * VC-3 encoder funded by the British Broadcasting Corporation * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 3daa09b314..f4ed7565e7 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -3,23 +3,23 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ #include "libavutil/cpu.h" @@ -31,6 +31,7 @@ #include "libavcodec/ac3dec.h" #include "dsputil_mmx.h" #include "idct_xvid.h" +#include "diracdsp_mmx.h" //#undef NDEBUG //#include <assert.h> @@ -836,7 +837,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, : "+r"(ptr) : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) ); - } else { + } else if(w==16){ __asm__ volatile ( "1: \n\t" "movd (%0), %%mm0 \n\t" @@ -857,6 +858,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, : "+r"(ptr) : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) ); + } else { + av_assert1(w == 4); + __asm__ volatile ( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "movd %%mm0, -4(%0) \n\t" + "movd -4(%0, %2), %%mm1 \n\t" + "punpcklbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, (%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) + ); } /* top and bottom (and hopefully also the corners) */ @@ -2162,8 +2182,115 @@ void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, avg_pixels8_mmx2(dst, src, stride, 8); } +/* only used in VP3/5/6 */ +static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h) +{ +// START_TIMER + MOVQ_BFE(mm6); + __asm__ volatile( + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq (%1,%4), %%mm2 \n\t" + "movq (%2,%4), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, (%3,%4) \n\t" + + "movq (%1,%4,2), %%mm0 \n\t" + "movq (%2,%4,2), %%mm1 \n\t" + "movq (%1,%5), %%mm2 \n\t" + "movq (%2,%5), %%mm3 \n\t" + "lea (%1,%4,4), %1 \n\t" + "lea (%2,%4,4), %2 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3,%4,2) \n\t" + "movq %%mm5, (%3,%5) \n\t" + "lea (%3,%4,4), %3 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+r"(h), "+r"(a), "+r"(b), "+r"(dst) + :"r"((x86_reg)stride), "r"((x86_reg)3L*stride) + :"memory"); +// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx") +} +static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h) +{ + put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h); + put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h); +} + +#if CONFIG_DIRAC_DECODER +#define DIRAC_PIXOP(OPNAME, EXT)\ +void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\ +}\ +void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\ +}\ +void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\ + OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\ +} + +DIRAC_PIXOP(put, mmx) +DIRAC_PIXOP(avg, mmx) +DIRAC_PIXOP(avg, mmx2) + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + put_pixels16_sse2(dst, src[0], stride, h); +} +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + avg_pixels16_sse2(dst, src[0], stride, h); +} +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + put_pixels16_sse2(dst , src[0] , stride, h); + put_pixels16_sse2(dst+16, src[0]+16, stride, h); +} +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + avg_pixels16_sse2(dst , src[0] , stride, h); + avg_pixels16_sse2(dst+16, src[0]+16, stride, h); +} +#endif + /* XXX: Those functions should be suppressed ASAP when all IDCTs are * converted. */ +#if CONFIG_GPL +static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, + DCTELEM *block) +{ + ff_mmx_idct(block); + ff_put_pixels_clamped_mmx(block, dest, line_size); +} + +static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, + DCTELEM *block) +{ + ff_mmx_idct(block); + ff_add_pixels_clamped_mmx(block, dest, line_size); +} + +static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, + DCTELEM *block) +{ + ff_mmxext_idct(block); + ff_put_pixels_clamped_mmx(block, dest, line_size); +} + +static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, + DCTELEM *block) +{ + ff_mmxext_idct(block); + ff_add_pixels_clamped_mmx(block, dest, line_size); +} +#endif static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) { @@ -2597,6 +2724,9 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c->add_bytes = add_bytes_mmx; + c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx; + c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx; + if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { c->h263_v_loop_filter = h263_v_loop_filter_mmx; c->h263_h_loop_filter = h263_h_loop_filter_mmx; @@ -3014,12 +3144,25 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) #if HAVE_INLINE_ASM const int idct_algo = avctx->idct_algo; - if (avctx->bits_per_raw_sample <= 8) { + if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { c->idct_put = ff_simple_idct_put_mmx; c->idct_add = ff_simple_idct_add_mmx; c->idct = ff_simple_idct_mmx; c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; +#if CONFIG_GPL + } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) { + if (mm_flags & AV_CPU_FLAG_MMX2) { + c->idct_put = ff_libmpeg2mmx2_idct_put; + c->idct_add = ff_libmpeg2mmx2_idct_add; + c->idct = ff_mmxext_idct; + } else { + c->idct_put = ff_libmpeg2mmx_idct_put; + c->idct_add = ff_libmpeg2mmx_idct_add; + c->idct = ff_mmx_idct; + } + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; +#endif } else if (idct_algo == FF_IDCT_CAVS) { c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; } else if (idct_algo == FF_IDCT_XVIDMMX) { diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h index f1db78d5c4..d0b0344917 100644 --- a/libavcodec/x86/dsputil_mmx.h +++ b/libavcodec/x86/dsputil_mmx.h @@ -2,20 +2,20 @@ * MMX optimized DSP utils * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c b/libavcodec/x86/dsputil_mmx_avg_template.c index 8b116b74e2..6f768595c0 100644 --- a/libavcodec/x86/dsputil_mmx_avg_template.c +++ b/libavcodec/x86/dsputil_mmx_avg_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_mmx_qns_template.c b/libavcodec/x86/dsputil_mmx_qns_template.c index 20a40a175e..77a41b9dcb 100644 --- a/libavcodec/x86/dsputil_mmx_qns_template.c +++ b/libavcodec/x86/dsputil_mmx_qns_template.c @@ -5,20 +5,20 @@ * MMX optimization by Michael Niedermayer <michaelni@gmx.at> * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_mmx_rnd_template.c b/libavcodec/x86/dsputil_mmx_rnd_template.c index 34a2c0bca8..e4c91381fa 100644 --- a/libavcodec/x86/dsputil_mmx_rnd_template.c +++ b/libavcodec/x86/dsputil_mmx_rnd_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index af2de15a25..06d2027c69 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -2,24 +2,24 @@ ;* MMX optimized DSP utils ;* Copyright (c) 2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" +%include "libavutil/x86/x86inc.asm" %include "x86util.asm" SECTION_RODATA @@ -1371,3 +1371,4 @@ cglobal bswap32_buf, 3,4,3 mov [r0], r2d .end: RET + diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index 0ac4d2c10d..77b7b7620e 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -3,20 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> @@ -821,8 +821,9 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i } #undef SUM -static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ +static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){ x86_reg i=0; + if(w>=16) __asm__ volatile( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" diff --git a/libavcodec/x86/dsputilenc_yasm.asm b/libavcodec/x86/dsputilenc_yasm.asm index cfd4e6d28b..1be359d667 100644 --- a/libavcodec/x86/dsputilenc_yasm.asm +++ b/libavcodec/x86/dsputilenc_yasm.asm @@ -4,25 +4,25 @@ ;* Copyright (c) 2000, 2001 Fabrice Bellard ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION .text diff --git a/libavcodec/x86/dwt.c b/libavcodec/x86/dwt.c new file mode 100644 index 0000000000..7ef9e2414e --- /dev/null +++ b/libavcodec/x86/dwt.c @@ -0,0 +1,202 @@ +/* + * MMX optimized discrete wavelet transform + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "dsputil_mmx.h" +#include "dwt.h" + +#define COMPOSE_VERTICAL(ext, align) \ +void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ +void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +\ +static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +\ +static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) { \ + b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ + b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ + } \ +\ + ff_vertical_compose_haar##ext(b0, b1, width_align); \ +} \ +static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar0i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = tmp[x];\ + b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ + }\ +}\ +static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar1i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = (tmp[x] + 1)>>1;\ + b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ + }\ +}\ +\ + +#if HAVE_YASM +#if !ARCH_X86_64 +COMPOSE_VERTICAL(_mmx, 4) +#endif +COMPOSE_VERTICAL(_sse2, 8) + + +void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); + +static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w) +{ + int w2= w>>1; + int x= w2 - (w2&7); + ff_horizontal_compose_dd97i_ssse3(b, tmp, w); + + for (; x < w2; x++) { + b[2*x ] = (tmp[x] + 1)>>1; + b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; + } +} +#endif + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) +{ +#if HAVE_YASM + int mm_flags = av_get_cpu_flags(); + +#if !ARCH_X86_64 + if (!(mm_flags & AV_CPU_FLAG_MMX)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar0i_mmx; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar1i_mmx; + break; + } +#endif + + if (!(mm_flags & AV_CPU_FLAG_SSE2)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar0i_sse2; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar1i_sse2; + break; + } + + if (!(mm_flags & AV_CPU_FLAG_SSSE3)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->horizontal_compose = horizontal_compose_dd97i_ssse3; + break; + } +#endif // HAVE_YASM +} diff --git a/libavcodec/x86/dwt.h b/libavcodec/x86/dwt.h new file mode 100644 index 0000000000..199f61175b --- /dev/null +++ b/libavcodec/x86/dwt.h @@ -0,0 +1,30 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DWT_H +#define AVCODEC_X86_DWT_H + +#include "libavcodec/dwt.h" + +void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type); + +#endif diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm new file mode 100644 index 0000000000..ac6c505409 --- /dev/null +++ b/libavcodec/x86/dwt_yasm.asm @@ -0,0 +1,291 @@ +;****************************************************************************** +;* MMX optimized discrete wavelet trasnform +;* Copyright (c) 2010 David Conrad +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_1991: times 4 dw 9,-1 + +section .text + +; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2 +%macro COMPOSE_53iL0 4 + paddw %2, %3 + paddw %2, %4 + psraw %2, 2 + psubw %1, %2 +%endm + +; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4 +; if %4 is supplied, %1 is loaded unaligned from there +; m2: clobbered m3: pw_8 m4: pw_1991 +%macro COMPOSE_DD97iH0 3-4 + paddw m0, %3 + paddw m1, %2 + psubw m0, m3 + mova m2, m1 + punpcklwd m1, m0 + punpckhwd m2, m0 + pmaddwd m1, m4 + pmaddwd m2, m4 +%if %0 > 3 + movu %1, %4 +%endif + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + paddw m1, %1 +%endm + +%macro COMPOSE_VERTICAL 1 +; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; int width) +cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width + mova m2, [pw_2] +.loop: + sub widthd, mmsize/2 + mova m1, [b0q+2*widthq] + mova m0, [b1q+2*widthq] + COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 + mova [b1q+2*widthq], m0 + jg .loop + REP_RET + +; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; int width) +cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width + mova m1, [pw_1] +.loop: + sub widthd, mmsize/2 + mova m0, [b0q+2*widthq] + paddw m0, [b2q+2*widthq] + paddw m0, m1 + psraw m0, 1 + paddw m0, [b1q+2*widthq] + mova [b1q+2*widthq], m0 + jg .loop + REP_RET + +; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; IDWTELEM *b3, IDWTELEM *b4, int width) +cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width + mova m3, [pw_8] + mova m4, [pw_1991] +.loop: + sub widthd, mmsize/2 + mova m0, [b0q+2*widthq] + mova m1, [b1q+2*widthq] + COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] + mova [b2q+2*widthq], m1 + jg .loop + REP_RET + +; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; IDWTELEM *b3, IDWTELEM *b4, int width) +cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width + mova m3, [pw_16] + mova m4, [pw_1991] +.loop: + sub widthd, mmsize/2 + mova m0, [b0q+2*widthq] + mova m1, [b1q+2*widthq] + mova m5, [b2q+2*widthq] + paddw m0, [b4q+2*widthq] + paddw m1, [b3q+2*widthq] + psubw m0, m3 + mova m2, m1 + punpcklwd m1, m0 + punpckhwd m2, m0 + pmaddwd m1, m4 + pmaddwd m2, m4 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + psubw m5, m1 + mova [b2q+2*widthq], m5 + jg .loop + REP_RET + +; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width) +cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width + mova m3, [pw_1] +.loop: + sub widthd, mmsize/2 + mova m1, [b1q+2*widthq] + mova m0, [b0q+2*widthq] + mova m2, m1 + paddw m1, m3 + psraw m1, 1 + psubw m0, m1 + mova [b0q+2*widthq], m0 + paddw m2, m0 + mova [b1q+2*widthq], m2 + jg .loop + REP_RET +%endmacro + +; extend the left and right edges of the tmp array by %1 and %2 respectively +%macro EDGE_EXTENSION 3 + mov %3, [tmpq] +%assign %%i 1 +%rep %1 + mov [tmpq-2*%%i], %3 + %assign %%i %%i+1 +%endrep + mov %3, [tmpq+2*w2q-2] +%assign %%i 0 +%rep %2 + mov [tmpq+2*w2q+2*%%i], %3 + %assign %%i %%i+1 +%endrep +%endmacro + + +%macro HAAR_HORIZONTAL 2 +; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xq, xq + shr w2d, 1 + lea b_w2q, [bq+wq] + mova m3, [pw_1] +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + paddw m1, m3 + psraw m1, 1 + psubw m0, m1 + mova [tmpq + 2*xq], m0 + add xq, mmsize/2 + cmp xq, w2q + jl .lowpass_loop + + xor xq, xq + and w2q, ~(mmsize/2 - 1) + cmp w2q, mmsize/2 + jl .end + +.highpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [tmpq + 2*xq] + paddw m1, m0 + + ; shift and interleave +%if %2 == 1 + paddw m0, m3 + paddw m1, m3 + psraw m0, 1 + psraw m1, 1 +%endif + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m0 + mova [bq+4*xq+mmsize], m2 + + add xq, mmsize/2 + cmp xq, w2q + jl .highpass_loop +.end: + REP_RET +%endmacro + + +INIT_XMM +; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xd, xd + shr w2d, 1 + lea b_w2q, [bq+wq] + movu m4, [bq+wq] + mova m7, [pw_2] + pslldq m4, 14 +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + mova m2, m1 + palignr m1, m4, 14 + mova m4, m2 + COMPOSE_53iL0 m0, m1, m2, m7 + mova [tmpq + 2*xq], m0 + add xd, mmsize/2 + cmp xd, w2d + jl .lowpass_loop + + EDGE_EXTENSION 1, 2, xw + ; leave the last up to 7 (sse) or 3 (mmx) values for C + xor xd, xd + and w2d, ~(mmsize/2 - 1) + cmp w2d, mmsize/2 + jl .end + + mova m7, [tmpq-mmsize] + mova m0, [tmpq] + mova m5, [pw_1] + mova m3, [pw_8] + mova m4, [pw_1991] +.highpass_loop: + mova m6, m0 + palignr m0, m7, 14 + mova m7, [tmpq + 2*xq + 16] + mova m1, m7 + mova m2, m7 + palignr m1, m6, 2 + palignr m2, m6, 4 + COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] + mova m0, m7 + mova m7, m6 + + ; shift and interleave + paddw m6, m5 + paddw m1, m5 + psraw m6, 1 + psraw m1, 1 + mova m2, m6 + punpcklwd m6, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m6 + mova [bq+4*xq+mmsize], m2 + + add xd, mmsize/2 + cmp xd, w2d + jl .highpass_loop +.end: + REP_RET + + +%if ARCH_X86_64 == 0 +INIT_MMX +COMPOSE_VERTICAL mmx +HAAR_HORIZONTAL mmx, 0 +HAAR_HORIZONTAL mmx, 1 +%endif + +;;INIT_XMM +INIT_XMM +COMPOSE_VERTICAL sse2 +HAAR_HORIZONTAL sse2, 0 +HAAR_HORIZONTAL sse2, 1 diff --git a/libavcodec/x86/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c index 3614fd151a..f8fef4d8b2 100644 --- a/libavcodec/x86/fdct_mmx.c +++ b/libavcodec/x86/fdct_mmx.c @@ -13,20 +13,20 @@ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm * Skal's fdct at http://skal.planet-d.net/coding/dct.html * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; -static struct +static const struct { DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; } fdct_r_row_sse2 = @@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct 29692, -12299, 26722, -31521, }; -static struct +static const struct { DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; } tab_frw_01234567_sse2 = diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index fcde3fa797..852c6b8d1e 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index 6e80b95d11..3f8b21d95b 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 6082d9ee36..faa27a01c6 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -6,20 +6,20 @@ ;* This algorithm (though not any of the implementation details) is ;* based on libdjbfft by D. J. Bernstein. ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -28,8 +28,8 @@ ; in blocks as conventient to the vector size. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" %if ARCH_X86_64 %define pointer resq @@ -37,6 +37,8 @@ %define pointer resd %endif +SECTION_RODATA + struc FFTContext .nbits: resd 1 .reverse: resd 1 @@ -52,8 +54,6 @@ struc FFTContext .imdcthalf:pointer 1 endstruc -SECTION_RODATA - %define M_SQRT1_2 0.70710678118654752440 %define M_COS_PI_1_8 0.923879532511287 %define M_COS_PI_3_8 0.38268343236509 @@ -394,6 +394,7 @@ fft32_interleave_avx: sub r2d, mmsize/4 jg .deint_loop ret + %endif INIT_XMM sse diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 0fd14fefa3..7368b8f518 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -2,25 +2,25 @@ ;* x86 optimized Format Conversion Utils ;* Copyright (c) 2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_TEXT diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index fbdc5262b9..814a17f631 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -3,20 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm index 64a4efe057..4155947e58 100644 --- a/libavcodec/x86/h264_chromamc.asm +++ b/libavcodec/x86/h264_chromamc.asm @@ -3,25 +3,25 @@ ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, ;* 2005-2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 940a8f77e1..a4ab08722a 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -7,25 +7,25 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Oskar Arvidsson <oskar@irock.se> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA @@ -390,8 +390,10 @@ cglobal deblock_h_luma_8, 5,9 INIT_XMM sse2 DEBLOCK_LUMA +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA +%endif %else @@ -509,8 +511,10 @@ INIT_MMX mmx2 DEBLOCK_LUMA v8, 8 INIT_XMM sse2 DEBLOCK_LUMA v, 16 +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA v, 16 +%endif %endif ; ARCH @@ -781,8 +785,10 @@ cglobal deblock_h_luma_intra_8, 2,4 INIT_XMM sse2 DEBLOCK_LUMA_INTRA v +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA_INTRA v +%endif %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_LUMA_INTRA v8 @@ -843,7 +849,11 @@ cglobal deblock_h_chroma_8, 5,7 TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call ff_chroma_inter_body_mmx2 + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 movq m0, buf0 movq m3, buf1 TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index ba2f91490e..d625eee4a4 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -24,8 +24,8 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA @@ -165,7 +165,7 @@ cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) SUB rsp, pad shl r2d, 2 shl r3d, 2 - LOAD_AB m4, m5, r2, r3 + LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize mov r2, r0 sub r0, r1 @@ -222,7 +222,7 @@ cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) SUB rsp, pad shl r2d, 2 shl r3d, 2 - LOAD_AB m4, m5, r2, r3 + LOAD_AB m4, m5, r2d, r3d mov r3, r1 mova am, m4 add r3, r1 @@ -351,7 +351,7 @@ cglobal deblock_v_luma_10, 5,5,15 %define mask2 m11 shl r2d, 2 shl r3d, 2 - LOAD_AB m12, m13, r2, r3 + LOAD_AB m12, m13, r2d, r3d mov r2, r0 sub r0, r1 sub r0, r1 @@ -379,7 +379,7 @@ cglobal deblock_v_luma_10, 5,5,15 cglobal deblock_h_luma_10, 5,7,15 shl r2d, 2 shl r3d, 2 - LOAD_AB m12, m13, r2, r3 + LOAD_AB m12, m13, r2d, r3d mov r2, r1 add r2, r1 add r2, r1 @@ -418,9 +418,11 @@ cglobal deblock_h_luma_10, 5,7,15 INIT_XMM sse2 DEBLOCK_LUMA_64 +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA_64 %endif +%endif %macro SWAPMOVA 2 %ifid %1 @@ -713,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16 INIT_XMM sse2 DEBLOCK_LUMA_INTRA_64 +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA_INTRA_64 +%endif %endif @@ -798,10 +802,12 @@ DEBLOCK_LUMA_INTRA INIT_XMM sse2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA +%if HAVE_AVX INIT_XMM avx DEBLOCK_LUMA DEBLOCK_LUMA_INTRA %endif +%endif ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp ; out: %1=p0', %2=q0' @@ -857,7 +863,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) .loop: %endif CHROMA_V_LOAD r5 - LOAD_AB m4, m5, r2, r3 + LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 pxor m4, m4 CHROMA_V_LOAD_TC m6, r4 @@ -891,7 +897,7 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) .loop: %endif CHROMA_V_LOAD r4 - LOAD_AB m4, m5, r2, r3 + LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 CHROMA_V_STORE @@ -912,5 +918,7 @@ DEBLOCK_CHROMA %endif INIT_XMM sse2 DEBLOCK_CHROMA +%if HAVE_AVX INIT_XMM avx DEBLOCK_CHROMA +%endif diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h index 2daa40ae52..b059cf9423 100644 --- a/libavcodec/x86/h264_i386.h +++ b/libavcodec/x86/h264_i386.h @@ -2,20 +2,20 @@ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index cc83806884..da045f71e2 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -9,25 +9,25 @@ ;* Holger Lubitz <hal@duncan.ol.sub.de> ;* Min Chen <chenm001.163.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 2aab9864d6..67bc68ec5c 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -22,8 +22,8 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 50a615b054..fd78456294 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -5,25 +5,25 @@ ;* Copyright (c) 2010 Loren Merritt ;* Copyright (c) 2010 Ronald S. Bultje ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 1423b561ac..79fa23e71d 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -22,8 +22,8 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 08c35c7138..59ab3ea27e 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Jason Garrett-Glaser * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c index fc1635de8b..71a1fbeed9 100644 --- a/libavcodec/x86/h264_qpel_mmx.c +++ b/libavcodec/x86/h264_qpel_mmx.c @@ -2,20 +2,20 @@ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * Copyright (c) 2011 Daniel Kang * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -1286,6 +1286,6 @@ QPEL16_OP(mc31, MMX)\ QPEL16_OP(mc32, MMX)\ QPEL16_OP(mc33, MMX) -#if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+ +#if CONFIG_H264QPEL && ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+ QPEL16(mmxext) #endif diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 22ce72d19f..26233bf080 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -4,24 +4,24 @@ ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" +%include "libavutil/x86/x86inc.asm" SECTION .text @@ -253,6 +253,13 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 add off_regd, 1 or off_regd, 1 add r4, 1 + cmp r5, 128 + jne .normal + sar r5, 1 + sar r6, 1 + sar off_regd, 1 + sub r4, 1 +.normal movd m4, r5d movd m0, r6d movd m5, off_regd diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 0612ffbb8b..87b9452501 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -133,7 +133,7 @@ LF_IFUNC(v, chroma_intra, depth, avx) LF_FUNCS(uint8_t, 8) LF_FUNCS(uint16_t, 10) -#if ARCH_X86_32 +#if ARCH_X86_32 && HAVE_YASM LF_FUNC(v8, luma, 8, mmx2) static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) @@ -292,7 +292,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; } - if (mm_flags & AV_CPU_FLAG_AVX) { + if (HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) { #if HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c new file mode 100644 index 0000000000..2408ab26ad --- /dev/null +++ b/libavcodec/x86/idct_mmx.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with mpeg2dec; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavcodec/dsputil.h" + +#include "libavutil/x86_cpu.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +#define ROW_SHIFT 11 +#define COL_SHIFT 6 + +#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) +#define rounder(bias) {round (bias), round (bias)} + + +#if 0 +/* C row IDCT - it is just here to document the MMXEXT and MMX versions */ +static inline void idct_row (int16_t * row, int offset, + int16_t * table, int32_t * rounder) +{ + int C1, C2, C3, C4, C5, C6, C7; + int a0, a1, a2, a3, b0, b1, b2, b3; + + row += offset; + + C1 = table[1]; + C2 = table[2]; + C3 = table[3]; + C4 = table[4]; + C5 = table[5]; + C6 = table[6]; + C7 = table[7]; + + a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; + a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; + a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; + a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; + + b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; + b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; + b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; + b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; +} +#endif + + +/* MMXEXT row IDCT */ + +#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ + c4, c6, c4, c6, \ + c1, c3, -c1, -c5, \ + c5, c7, c3, -c7, \ + c4, -c6, c4, -c6, \ + -c4, c2, c4, -c2, \ + c5, -c1, c3, -c1, \ + c7, c3, c7, -c5 } + +static inline void mmxext_row_head (int16_t * const row, const int offset, + const int16_t * const table) +{ + __asm__ volatile( + "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ + + "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ + "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ + + "movq (%1), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */ + "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ + + "movq 8(%1), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */ + "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ + + "pshufw $0x4e, %%mm2, %%mm2 \n\t" /* mm2 = x2 x0 x6 x4 */ + :: "r" ((row+offset)), "r" (table) + ); +} + +static inline void mmxext_row (const int16_t * const table, + const int32_t * const rounder) +{ + __asm__ volatile ( + "movq 16(%0), %%mm1 \n\t" /* mm1 = -C5 -C1 C3 C1 */ + "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ + + "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ + "pshufw $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */ + + "movq 24(%0), %%mm7 \n\t" /* mm7 = -C7 C3 C7 C5 */ + "pmaddwd %%mm5, %%mm1 \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */ + + "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ + "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ + + "pmaddwd 40(%0), %%mm2 \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */ + "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ + + "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ + "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ + + "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ + "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ + + "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ + "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ + + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ + "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ + + "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ + + "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ + "movq %%mm0, %%mm4 \n\t" /* mm4 = a3 a2 + rounder */ + + "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ + "psubd %%mm5, %%mm4 \n\t" /* mm4 = a3-b3 a2-b2 + rounder */ + : : "r" (table), "r" (rounder)); +} + +static inline void mmxext_row_tail (int16_t * const row, const int store) +{ + __asm__ volatile ( + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ + + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ + + "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ + + "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ + + "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ + "pshufw $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */ + + /* slot */ + + "movq %%mm4, 8(%0) \n\t" /* save y7 y6 y5 y4 */ + :: "r" (row+store) + ); +} + +static inline void mmxext_row_mid (int16_t * const row, const int store, + const int offset, + const int16_t * const table) +{ + __asm__ volatile ( + "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ + + "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ + + "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ + "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ + + "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ + "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ + + "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ + "pshufw $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */ + + "movq (%3), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */ + "movq %%mm4, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ + + "pmaddwd %%mm0, %%mm3 \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */ + + "movq 8(%3), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */ + "pshufw $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */ + :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) + ); +} + + +/* MMX row IDCT */ + +#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ + c4, c6, -c4, -c2, \ + c1, c3, c3, -c7, \ + c5, c7, -c1, -c5, \ + c4, -c6, c4, -c2, \ + -c4, c2, c4, -c6, \ + c5, -c1, c7, -c5, \ + c7, c3, c3, -c1 } + +static inline void mmx_row_head (int16_t * const row, const int offset, + const int16_t * const table) +{ + __asm__ volatile ( + "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ + + "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ + "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ + + "movq (%1), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */ + "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ + + "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */ + + "movq 8(%1), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */ + "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ + + "movq 16(%1), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */ + "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */ + :: "r" ((row+offset)), "r" (table) + ); +} + +static inline void mmx_row (const int16_t * const table, + const int32_t * const rounder) +{ + __asm__ volatile ( + "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ + "punpckldq %%mm5, %%mm5 \n\t" /* mm5 = x3 x1 x3 x1 */ + + "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ + "punpckhdq %%mm6, %%mm6 \n\t" /* mm6 = x7 x5 x7 x5 */ + + "movq 24(%0), %%mm7 \n\t" /* mm7 = -C5 -C1 C7 C5 */ + "pmaddwd %%mm5, %%mm1 \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ + + "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ + "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ + + "pmaddwd 40(%0), %%mm2 \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ + "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ + + "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ + "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ + + "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ + "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ + + "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ + "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ + + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ + "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ + + "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ + + "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ + "movq %%mm0, %%mm7 \n\t" /* mm7 = a3 a2 + rounder */ + + "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ + "psubd %%mm5, %%mm7 \n\t" /* mm7 = a3-b3 a2-b2 + rounder */ + :: "r" (table), "r" (rounder) + ); +} + +static inline void mmx_row_tail (int16_t * const row, const int store) +{ + __asm__ volatile ( + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ + + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ + + "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ + + "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ + + "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ + "movq %%mm7, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ + + "pslld $16, %%mm7 \n\t" /* mm7 = y7 0 y5 0 */ + + "psrld $16, %%mm4 \n\t" /* mm4 = 0 y6 0 y4 */ + + "por %%mm4, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ + + /* slot */ + + "movq %%mm7, 8(%0) \n\t" /* save y7 y6 y5 y4 */ + :: "r" (row+store) + ); +} + +static inline void mmx_row_mid (int16_t * const row, const int store, + const int offset, const int16_t * const table) +{ + + __asm__ volatile ( + "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ + + "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ + "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ + + "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ + "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ + + "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ + "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ + + "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ + "movq %%mm7, %%mm1 \n\t" /* mm1 = y6 y7 y4 y5 */ + + "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */ + "psrld $16, %%mm7 \n\t" /* mm7 = 0 y6 0 y4 */ + + "movq (%3), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */ + "pslld $16, %%mm1 \n\t" /* mm1 = y7 0 y5 0 */ + + "movq 8(%3), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */ + "por %%mm1, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ + + "movq 16(%3), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */ + "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */ + + "movq %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ + "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ + : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) + ); +} + + +#if 0 +/* C column IDCT - it is just here to document the MMXEXT and MMX versions */ +static inline void idct_col (int16_t * col, int offset) +{ +/* multiplication - as implemented on mmx */ +#define F(c,x) (((c) * (x)) >> 16) + +/* saturation - it helps us handle torture test cases */ +#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) + + int16_t x0, x1, x2, x3, x4, x5, x6, x7; + int16_t y0, y1, y2, y3, y4, y5, y6, y7; + int16_t a0, a1, a2, a3, b0, b1, b2, b3; + int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; + + col += offset; + + x0 = col[0*8]; + x1 = col[1*8]; + x2 = col[2*8]; + x3 = col[3*8]; + x4 = col[4*8]; + x5 = col[5*8]; + x6 = col[6*8]; + x7 = col[7*8]; + + u04 = S (x0 + x4); + v04 = S (x0 - x4); + u26 = S (F (T2, x6) + x2); + v26 = S (F (T2, x2) - x6); + + a0 = S (u04 + u26); + a1 = S (v04 + v26); + a2 = S (v04 - v26); + a3 = S (u04 - u26); + + u17 = S (F (T1, x7) + x1); + v17 = S (F (T1, x1) - x7); + u35 = S (F (T3, x5) + x3); + v35 = S (F (T3, x3) - x5); + + b0 = S (u17 + u35); + b3 = S (v17 - v35); + u12 = S (u17 - u35); + v12 = S (v17 + v35); + u12 = S (2 * F (C4, u12)); + v12 = S (2 * F (C4, v12)); + b1 = S (u12 + v12); + b2 = S (u12 - v12); + + y0 = S (a0 + b0) >> COL_SHIFT; + y1 = S (a1 + b1) >> COL_SHIFT; + y2 = S (a2 + b2) >> COL_SHIFT; + y3 = S (a3 + b3) >> COL_SHIFT; + + y4 = S (a3 - b3) >> COL_SHIFT; + y5 = S (a2 - b2) >> COL_SHIFT; + y6 = S (a1 - b1) >> COL_SHIFT; + y7 = S (a0 - b0) >> COL_SHIFT; + + col[0*8] = y0; + col[1*8] = y1; + col[2*8] = y2; + col[3*8] = y3; + col[4*8] = y4; + col[5*8] = y5; + col[6*8] = y6; + col[7*8] = y7; +} +#endif + + +/* MMX column IDCT */ +static inline void idct_col (int16_t * const col, const int offset) +{ +#define T1 13036 +#define T2 27146 +#define T3 43790 +#define C4 23170 + + DECLARE_ALIGNED(8, static const short, t1_vector)[] = { + T1,T1,T1,T1, + T2,T2,T2,T2, + T3,T3,T3,T3, + C4,C4,C4,C4 + }; + + /* column code adapted from Peter Gubanov */ + /* http://www.elecard.com/peter/idct.shtml */ + + __asm__ volatile ( + "movq (%0), %%mm0 \n\t" /* mm0 = T1 */ + + "movq 2*8(%1), %%mm1 \n\t" /* mm1 = x1 */ + "movq %%mm0, %%mm2 \n\t" /* mm2 = T1 */ + + "movq 7*2*8(%1), %%mm4 \n\t" /* mm4 = x7 */ + "pmulhw %%mm1, %%mm0 \n\t" /* mm0 = T1*x1 */ + + "movq 16(%0), %%mm5 \n\t" /* mm5 = T3 */ + "pmulhw %%mm4, %%mm2 \n\t" /* mm2 = T1*x7 */ + + "movq 2*5*8(%1), %%mm6 \n\t" /* mm6 = x5 */ + "movq %%mm5, %%mm7 \n\t" /* mm7 = T3-1 */ + + "movq 3*8*2(%1), %%mm3 \n\t" /* mm3 = x3 */ + "psubsw %%mm4, %%mm0 \n\t" /* mm0 = v17 */ + + "movq 8(%0), %%mm4 \n\t" /* mm4 = T2 */ + "pmulhw %%mm3, %%mm5 \n\t" /* mm5 = (T3-1)*x3 */ + + "paddsw %%mm2, %%mm1 \n\t" /* mm1 = u17 */ + "pmulhw %%mm6, %%mm7 \n\t" /* mm7 = (T3-1)*x5 */ + + /* slot */ + + "movq %%mm4, %%mm2 \n\t" /* mm2 = T2 */ + "paddsw %%mm3, %%mm5 \n\t" /* mm5 = T3*x3 */ + + "pmulhw 2*8*2(%1), %%mm4 \n\t" /* mm4 = T2*x2 */ + "paddsw %%mm6, %%mm7 \n\t" /* mm7 = T3*x5 */ + + "psubsw %%mm6, %%mm5 \n\t" /* mm5 = v35 */ + "paddsw %%mm3, %%mm7 \n\t" /* mm7 = u35 */ + + "movq 6*8*2(%1), %%mm3 \n\t" /* mm3 = x6 */ + "movq %%mm0, %%mm6 \n\t" /* mm6 = v17 */ + + "pmulhw %%mm3, %%mm2 \n\t" /* mm2 = T2*x6 */ + "psubsw %%mm5, %%mm0 \n\t" /* mm0 = b3 */ + + "psubsw %%mm3, %%mm4 \n\t" /* mm4 = v26 */ + "paddsw %%mm6, %%mm5 \n\t" /* mm5 = v12 */ + + "movq %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */ + "movq %%mm1, %%mm6 \n\t" /* mm6 = u17 */ + + "paddsw 2*8*2(%1), %%mm2 \n\t" /* mm2 = u26 */ + "paddsw %%mm7, %%mm6 \n\t" /* mm6 = b0 */ + + "psubsw %%mm7, %%mm1 \n\t" /* mm1 = u12 */ + "movq %%mm1, %%mm7 \n\t" /* mm7 = u12 */ + + "movq 0*8(%1), %%mm3 \n\t" /* mm3 = x0 */ + "paddsw %%mm5, %%mm1 \n\t" /* mm1 = u12+v12 */ + + "movq 24(%0), %%mm0 \n\t" /* mm0 = C4/2 */ + "psubsw %%mm5, %%mm7 \n\t" /* mm7 = u12-v12 */ + + "movq %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */ + "pmulhw %%mm0, %%mm1 \n\t" /* mm1 = b1/2 */ + + "movq %%mm4, %%mm6 \n\t" /* mm6 = v26 */ + "pmulhw %%mm0, %%mm7 \n\t" /* mm7 = b2/2 */ + + "movq 4*8*2(%1), %%mm5 \n\t" /* mm5 = x4 */ + "movq %%mm3, %%mm0 \n\t" /* mm0 = x0 */ + + "psubsw %%mm5, %%mm3 \n\t" /* mm3 = v04 */ + "paddsw %%mm5, %%mm0 \n\t" /* mm0 = u04 */ + + "paddsw %%mm3, %%mm4 \n\t" /* mm4 = a1 */ + "movq %%mm0, %%mm5 \n\t" /* mm5 = u04 */ + + "psubsw %%mm6, %%mm3 \n\t" /* mm3 = a2 */ + "paddsw %%mm2, %%mm5 \n\t" /* mm5 = a0 */ + + "paddsw %%mm1, %%mm1 \n\t" /* mm1 = b1 */ + "psubsw %%mm2, %%mm0 \n\t" /* mm0 = a3 */ + + "paddsw %%mm7, %%mm7 \n\t" /* mm7 = b2 */ + "movq %%mm3, %%mm2 \n\t" /* mm2 = a2 */ + + "movq %%mm4, %%mm6 \n\t" /* mm6 = a1 */ + "paddsw %%mm7, %%mm3 \n\t" /* mm3 = a2+b2 */ + + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */ + "paddsw %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */ + + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */ + "psubsw %%mm1, %%mm6 \n\t" /* mm6 = a1-b1 */ + + "movq 5*8*2(%1), %%mm1 \n\t" /* mm1 = b0 */ + "psubsw %%mm7, %%mm2 \n\t" /* mm2 = a2-b2 */ + + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */ + "movq %%mm5, %%mm7 \n\t" /* mm7 = a0 */ + + "movq %%mm4, 1*8*2(%1)\n\t" /* save y1 */ + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */ + + "movq %%mm3, 2*8*2(%1)\n\t" /* save y2 */ + "paddsw %%mm1, %%mm5 \n\t" /* mm5 = a0+b0 */ + + "movq 3*8*2(%1), %%mm4 \n\t" /* mm4 = b3 */ + "psubsw %%mm1, %%mm7 \n\t" /* mm7 = a0-b0 */ + + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */ + "movq %%mm0, %%mm3 \n\t" /* mm3 = a3 */ + + "movq %%mm2, 5*8*2(%1)\n\t" /* save y5 */ + "psubsw %%mm4, %%mm3 \n\t" /* mm3 = a3-b3 */ + + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */ + "paddsw %%mm0, %%mm4 \n\t" /* mm4 = a3+b3 */ + + "movq %%mm5, 0*8*2(%1)\n\t" /* save y0 */ + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */ + + "movq %%mm6, 6*8*2(%1)\n\t" /* save y6 */ + "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */ + + "movq %%mm7, 7*8*2(%1)\n\t" /* save y7 */ + + "movq %%mm3, 4*8*2(%1)\n\t" /* save y4 */ + + "movq %%mm4, 3*8*2(%1)\n\t" /* save y3 */ + :: "r" (t1_vector), "r" (col+offset) + ); + +#undef T1 +#undef T2 +#undef T3 +#undef C4 +} + + +DECLARE_ALIGNED(8, static const int32_t, rounder0)[] = + rounder ((1 << (COL_SHIFT - 1)) - 0.5); +DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0); +DECLARE_ALIGNED(8, static const int32_t, rounder1)[] = + rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ +DECLARE_ALIGNED(8, static const int32_t, rounder7)[] = + rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ +DECLARE_ALIGNED(8, static const int32_t, rounder2)[] = + rounder (0.60355339059); /* C2 * (C6+C2)/2 */ +DECLARE_ALIGNED(8, static const int32_t, rounder6)[] = + rounder (-0.25); /* C2 * (C6-C2)/2 */ +DECLARE_ALIGNED(8, static const int32_t, rounder3)[] = + rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ +DECLARE_ALIGNED(8, static const int32_t, rounder5)[] = + rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ + +#undef COL_SHIFT +#undef ROW_SHIFT + +#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ +void idct (int16_t * const block) \ +{ \ + DECLARE_ALIGNED(16, static const int16_t, table04)[] = \ + table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ + DECLARE_ALIGNED(16, static const int16_t, table17)[] = \ + table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ + DECLARE_ALIGNED(16, static const int16_t, table26)[] = \ + table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ + DECLARE_ALIGNED(16, static const int16_t, table35)[] = \ + table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ + \ + idct_row_head (block, 0*8, table04); \ + idct_row (table04, rounder0); \ + idct_row_mid (block, 0*8, 4*8, table04); \ + idct_row (table04, rounder4); \ + idct_row_mid (block, 4*8, 1*8, table17); \ + idct_row (table17, rounder1); \ + idct_row_mid (block, 1*8, 7*8, table17); \ + idct_row (table17, rounder7); \ + idct_row_mid (block, 7*8, 2*8, table26); \ + idct_row (table26, rounder2); \ + idct_row_mid (block, 2*8, 6*8, table26); \ + idct_row (table26, rounder6); \ + idct_row_mid (block, 6*8, 3*8, table35); \ + idct_row (table35, rounder3); \ + idct_row_mid (block, 3*8, 5*8, table35); \ + idct_row (table35, rounder5); \ + idct_row_tail (block, 5*8); \ + \ + idct_col (block, 0); \ + idct_col (block, 4); \ +} + +declare_idct (ff_mmxext_idct, mmxext_table, + mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) + +declare_idct (ff_mmx_idct, mmx_table, + mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) + +#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c index e4c778c398..5d8027fb27 100644 --- a/libavcodec/x86/idct_mmx_xvid.c +++ b/libavcodec/x86/idct_mmx_xvid.c @@ -22,20 +22,20 @@ * * conversion to gcc syntax by Michael Niedermayer * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, + * along with FFmpeg; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c index 8249e97ccf..3708f93df8 100644 --- a/libavcodec/x86/idct_sse2_xvid.c +++ b/libavcodec/x86/idct_sse2_xvid.c @@ -9,7 +9,7 @@ * * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. * - * This file is part of Libav. + * This file is part of FFmpeg. * * Vertical pass is an implementation of the scheme: * Loeffler C., Ligtenberg A., and Moschytz C.S.: @@ -23,18 +23,18 @@ * * More details at http://skal.planet-d.net/coding/dct.html * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, + * along with FFmpeg; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h index 495d2caaf9..be91d1c68a 100644 --- a/libavcodec/x86/idct_xvid.h +++ b/libavcodec/x86/idct_xvid.h @@ -1,20 +1,20 @@ /* * XVID MPEG-4 VIDEO CODEC * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/imdct36_sse.asm b/libavcodec/x86/imdct36_sse.asm index 937a2cc416..336e9f0c54 100644 --- a/libavcodec/x86/imdct36_sse.asm +++ b/libavcodec/x86/imdct36_sse.asm @@ -371,8 +371,10 @@ DEFINE_IMDCT INIT_XMM ssse3 DEFINE_IMDCT +%if HAVE_AVX INIT_XMM avx DEFINE_IMDCT +%endif INIT_XMM sse @@ -717,5 +719,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp INIT_XMM sse DEFINE_FOUR_IMDCT +%if HAVE_AVX INIT_XMM avx DEFINE_FOUR_IMDCT +%endif diff --git a/libavcodec/x86/lpc_mmx.c b/libavcodec/x86/lpc_mmx.c index 27bebe856a..087a324a43 100644 --- a/libavcodec/x86/lpc_mmx.c +++ b/libavcodec/x86/lpc_mmx.c @@ -2,20 +2,20 @@ * MMX optimized LPC DSP utils * Copyright (c) 2007 Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h index e056eb0a2d..78f37f24eb 100644 --- a/libavcodec/x86/mathops.h +++ b/libavcodec/x86/mathops.h @@ -2,20 +2,20 @@ * simple math operations * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp.c index 400855d7c4..7ea77fc1b8 100644 --- a/libavcodec/x86/mlpdsp.c +++ b/libavcodec/x86/mlpdsp.c @@ -2,20 +2,20 @@ * MLP DSP functions x86-optimized * Copyright (c) 2009 Ramiro Polla * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c index ab845c1129..1d3545a5e8 100644 --- a/libavcodec/x86/motion_est_mmx.c +++ b/libavcodec/x86/motion_est_mmx.c @@ -5,20 +5,20 @@ * * mostly by Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index 88a347796c..0d6cc08305 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -2,20 +2,20 @@ * MMX optimized MP3 decoding functions * Copyright (c) 2010 Vitor Sessak * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -214,11 +214,17 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ } \ } +#if HAVE_YASM +#if HAVE_SSE DECL_IMDCT_BLOCKS(sse,sse) DECL_IMDCT_BLOCKS(sse2,sse) DECL_IMDCT_BLOCKS(sse3,sse) DECL_IMDCT_BLOCKS(ssse3,sse) +#endif +#if HAVE_AVX DECL_IMDCT_BLOCKS(avx,avx) +#endif +#endif void ff_mpadsp_init_mmx(MPADSPContext *s) { @@ -244,8 +250,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s) } #endif /* HAVE_INLINE_ASM */ #if HAVE_YASM - if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { + if (0) { +#if HAVE_AVX + } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { s->imdct36_blocks_float = imdct36_blocks_avx; +#endif #if HAVE_SSE } else if (mm_flags & AV_CPU_FLAG_SSSE3) { s->imdct36_blocks_float = imdct36_blocks_ssse3; diff --git a/libavcodec/x86/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c index 85f6866342..44d4cd3a8a 100644 --- a/libavcodec/x86/mpegvideo_mmx.c +++ b/libavcodec/x86/mpegvideo_mmx.c @@ -5,20 +5,20 @@ * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c index 53e09bdbfe..82e4ffa558 100644 --- a/libavcodec/x86/mpegvideo_mmx_template.c +++ b/libavcodec/x86/mpegvideo_mmx_template.c @@ -3,20 +3,20 @@ * * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -110,10 +110,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if (s->mb_intra) { int dummy; - if (n < 4) + if (n < 4){ q = s->y_dc_scale; - else + bias = s->q_intra_matrix16[qscale][1]; + qmat = s->q_intra_matrix16[qscale][0]; + }else{ q = s->c_dc_scale; + bias = s->q_chroma_intra_matrix16[qscale][1]; + qmat = s->q_chroma_intra_matrix16[qscale][0]; + } /* note: block[0] is assumed to be positive */ if (!s->h263_aic) { __asm__ volatile ( @@ -128,8 +133,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; - bias = s->q_intra_matrix16[qscale][1]; - qmat = s->q_intra_matrix16[qscale][0]; } else { last_non_zero_p1 = 0; bias = s->q_inter_matrix16[qscale][1]; diff --git a/libavcodec/x86/pngdsp-init.c b/libavcodec/x86/pngdsp-init.c index aa21847db2..7a12730620 100644 --- a/libavcodec/x86/pngdsp-init.c +++ b/libavcodec/x86/pngdsp-init.c @@ -2,20 +2,20 @@ * x86 PNG optimizations. * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index d6e6374c89..8999c17c82 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -28,7 +28,7 @@ SECTION_RODATA cextern pw_255 -section .text align=16 +SECTION_TEXT 16 ; %1 = nr. of xmm registers used %macro ADD_BYTES_FN 1 diff --git a/libavcodec/x86/proresdsp-init.c b/libavcodec/x86/proresdsp-init.c index f202f9f0cf..c4aeb7f503 100644 --- a/libavcodec/x86/proresdsp-init.c +++ b/libavcodec/x86/proresdsp-init.c @@ -29,11 +29,14 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, DCTELEM *block, const int16_t *qmat); -void ff_proresdsp_x86_init(ProresDSPContext *dsp) +void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) { #if ARCH_X86_64 && HAVE_YASM int flags = av_get_cpu_flags(); + if(avctx->flags & CODEC_FLAG_BITEXACT) + return; + if (flags & AV_CPU_FLAG_SSE2) { dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; dsp->idct_put = ff_prores_idct_put_10_sse2; diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index bce36ac1fc..d626fdce08 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -1,23 +1,24 @@ ;****************************************************************************** ;* x86-SIMD-optimized IDCT for prores -;* this is identical to "simple" IDCT except for the clip range +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range ;* ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -48,10 +49,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2 w5_min_w1: times 4 dw W5sh2, -W1sh2 w5_plus_w7: times 4 dw W5sh2, +W7sh2 w7_min_w5: times 4 dw W7sh2, -W5sh2 -row_round: times 8 dw (1<<14) +pw_88: times 8 dw 0x2008 +cextern pw_1 cextern pw_4 -cextern pw_8 cextern pw_512 cextern pw_1019 @@ -92,14 +93,12 @@ section .text align=16 ; a2 -= W6 * row[2]; ; a3 -= W2 * row[2]; %ifidn %1, col - paddw m10,[pw_8] + paddw m10,[pw_88] %endif - SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] %ifidn %1, row - psubw m10,[row_round] + paddw m10,[pw_1] %endif - SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7] - SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7] + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] pmaddwd m2, m0, [w4_plus_w6] pmaddwd m3, m1, [w4_plus_w6] pmaddwd m4, m0, [w4_min_w6] @@ -108,75 +107,33 @@ section .text align=16 pmaddwd m7, m1, [w4_min_w2] pmaddwd m0, [w4_plus_w2] pmaddwd m1, [w4_plus_w2] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; a0: -1*row[0]-1*row[2] ; a1: -1*row[0] ; a2: -1*row[0] ; a3: -1*row[0]+1*row[2] - psubd m2, m10 ; a1[0-3] - psubd m3, m11 ; a1[4-7] - psubd m4, m10 ; a2[0-3] - psubd m5, m11 ; a2[4-7] - psubd m0, m10 - psubd m1, m11 - psubd m6, m10 - psubd m7, m11 - psubd m0, m8 ; a0[0-3] - psubd m1, m9 ; a0[4-7] - paddd m6, m8 ; a3[0-3] - paddd m7, m9 ; a3[4-7] ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4] SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] - SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7] pmaddwd m10, m8, [w4_plus_w6] pmaddwd m11, m9, [w4_plus_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m0, m10 ; a0[0-3] paddd m1, m11 ; a0[4-7] pmaddwd m10, m8, [w4_min_w6] pmaddwd m11, m9, [w4_min_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m6, m10 ; a3[0-3] paddd m7, m11 ; a3[4-7] pmaddwd m10, m8, [w4_min_w2] pmaddwd m11, m9, [w4_min_w2] pmaddwd m8, [w4_plus_w2] pmaddwd m9, [w4_plus_w2] - pslld m10, 2 - pslld m11, 2 - pslld m8, 2 - pslld m9, 2 - psubd m10, m13 - psubd m11, m14 - psubd m8, m13 - psubd m9, m14 psubd m4, m10 ; a2[0-3] intermediate psubd m5, m11 ; a2[4-7] intermediate psubd m2, m8 ; a1[0-3] intermediate psubd m3, m9 ; a1[4-7] intermediate - SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7] - psubd m4, m12 ; a2[0-3] - psubd m5, m13 ; a2[4-7] - paddd m2, m12 ; a1[0-3] - paddd m3, m13 ; a1[4-7] ; load/store mova [r2+ 0], m0 @@ -207,8 +164,6 @@ section .text align=16 ; b3 = MUL(W7, row[1]); ; MAC(b3, -W5, row[3]); SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] - SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7] - SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7] pmaddwd m2, m0, [w3_min_w7] pmaddwd m3, m1, [w3_min_w7] pmaddwd m4, m0, [w5_min_w1] @@ -217,35 +172,11 @@ section .text align=16 pmaddwd m7, m1, [w7_min_w5] pmaddwd m0, [w1_plus_w3] pmaddwd m1, [w1_plus_w3] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; b0: +1*row[1]+2*row[3] ; b1: +2*row[1]-1*row[3] ; b2: -1*row[1]-1*row[3] ; b3: +1*row[1]+1*row[3] - psubd m2, m8 - psubd m3, m9 - paddd m0, m8 - paddd m1, m9 - paddd m8, m10 ; { row[1] + row[3] }[0-3] - paddd m9, m11 ; { row[1] + row[3] }[4-7] - paddd m10, m10 - paddd m11, m11 - paddd m0, m8 ; b0[0-3] - paddd m1, m9 ; b0[4-7] - paddd m2, m10 ; b1[0-3] - paddd m3, m11 ; b2[4-7] - psubd m4, m8 ; b2[0-3] - psubd m5, m9 ; b2[4-7] - paddd m6, m8 ; b3[0-3] - paddd m7, m9 ; b3[4-7] ; MAC(b0, W5, row[5]); ; MAC(b0, W7, row[7]); @@ -256,38 +187,16 @@ section .text align=16 ; MAC(b3, W3, row[5]); ; MAC(b3, -W1, row[7]); SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] - SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7] - SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7] ; b0: -1*row[5]+1*row[7] ; b1: -1*row[5]+1*row[7] ; b2: +1*row[5]+2*row[7] ; b3: +2*row[5]-1*row[7] - paddd m4, m13 - paddd m5, m12 - paddd m6, m13 - paddd m7, m12 - psubd m13, m14 ; { row[5] - row[7] }[0-3] - psubd m12, m11 ; { row[5] - row[7] }[4-7] - paddd m14, m14 - paddd m11, m11 - psubd m0, m13 - psubd m1, m12 - psubd m2, m13 - psubd m3, m12 - paddd m4, m14 - paddd m5, m11 - paddd m6, m13 - paddd m7, m12 pmaddwd m10, m8, [w1_plus_w5] pmaddwd m11, m9, [w1_plus_w5] pmaddwd m12, m8, [w5_plus_w7] pmaddwd m13, m9, [w5_plus_w7] - pslld m10, 2 - pslld m11, 2 - pslld m12, 2 - pslld m13, 2 psubd m2, m10 ; b1[0-3] psubd m3, m11 ; b1[4-7] paddd m0, m12 ; b0[0-3] @@ -296,10 +205,6 @@ section .text align=16 pmaddwd m13, m9, [w7_plus_w3] pmaddwd m8, [w3_min_w1] pmaddwd m9, [w3_min_w1] - pslld m12, 2 - pslld m13, 2 - pslld m8, 2 - pslld m9, 2 paddd m4, m12 ; b2[0-3] paddd m5, m13 ; b2[4-7] paddd m6, m8 ; b3[0-3] @@ -346,7 +251,7 @@ cglobal prores_idct_put_10, 4, 4, %1 pmullw m13,[r3+64] pmullw m12,[r3+96] - IDCT_1D row, 17 + IDCT_1D row, 15 ; transpose for second part of IDCT TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 @@ -361,20 +266,11 @@ cglobal prores_idct_put_10, 4, 4, %1 ; for (i = 0; i < 8; i++) ; idctSparseColAdd(dest + i, line_size, block + i); - IDCT_1D col, 20 + IDCT_1D col, 18 ; clip/store - mova m6, [pw_512] mova m3, [pw_4] mova m5, [pw_1019] - paddw m8, m6 - paddw m0, m6 - paddw m1, m6 - paddw m2, m6 - paddw m4, m6 - paddw m11, m6 - paddw m9, m6 - paddw m10, m6 pmaxsw m8, m3 pmaxsw m0, m3 pmaxsw m1, m3 @@ -423,7 +319,9 @@ INIT_XMM sse2 idct_put_fn 16 INIT_XMM sse4 idct_put_fn 16 +%if HAVE_AVX INIT_XMM avx idct_put_fn 16 +%endif %endif diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index c43b77abd2..78d8c92b0b 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -49,7 +49,7 @@ SECTION .text cglobal rv34_idct_%1, 1, 2, 0 movsx r1, word [r0] IDCT_DC r1 - movd m0, r1 + movd m0, r1d pshufw m0, m0, 0 movq [r0+ 0], m0 movq [r0+ 8], m0 @@ -70,7 +70,7 @@ cglobal rv34_idct_dc_add, 3, 3 ; calculate DC IDCT_DC_ROUND r2 pxor m1, m1 - movd m0, r2 + movd m0, r2d psubw m1, m0 packuswb m0, m0 packuswb m1, m1 @@ -175,7 +175,7 @@ cglobal rv34_idct_dc_add, 3, 3, 6 pxor m1, m1 ; calculate DC - movd m0, r2 + movd m0, r2d lea r2, [r0+r1*2] movd m2, [r0] movd m3, [r0+r1] diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index ae740c213a..70c0c0400f 100644 --- a/libavcodec/x86/rv40dsp.asm +++ b/libavcodec/x86/rv40dsp.asm @@ -466,8 +466,8 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8 add r2, r6 neg r6 - movd m2, r3 - movd m3, r4 + movd m2, r3d + movd m3, r4d %ifidn %1,rnd %define RND 0 SPLATW m2, m2 diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c index bee33657b5..c508ac9328 100644 --- a/libavcodec/x86/rv40dsp_init.c +++ b/libavcodec/x86/rv40dsp_init.c @@ -56,6 +56,8 @@ DECLARE_WEIGHT(mmx2) DECLARE_WEIGHT(sse2) DECLARE_WEIGHT(ssse3) +#if HAVE_YASM + /** @{ */ /** * Define one qpel function. @@ -182,6 +184,8 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \ QPEL_FUNCS_SET (OP, 3, 2, OPT) /** @} */ +#endif //HAVE_YASM + void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) { #if HAVE_YASM diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c index f455eb8974..20e51a47f4 100644 --- a/libavcodec/x86/simple_idct_mmx.c +++ b/libavcodec/x86/simple_idct_mmx.c @@ -3,20 +3,20 @@ * * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavcodec/dsputil.h" diff --git a/libavcodec/x86/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c index 770cc1cc73..5d47206e81 100644 --- a/libavcodec/x86/snowdsp_mmx.c +++ b/libavcodec/x86/snowdsp_mmx.c @@ -2,20 +2,20 @@ * MMX and SSE2 optimized snow DSP utils * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -675,14 +675,14 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM #define snow_inner_add_yblock_sse2_end_8\ "sal $1, %%"REG_c" \n\t"\ - "addl $"PTR_SIZE"*2, %1 \n\t"\ + "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ snow_inner_add_yblock_sse2_end_common1\ "sar $1, %%"REG_c" \n\t"\ "sub $2, %2 \n\t"\ snow_inner_add_yblock_sse2_end_common2 #define snow_inner_add_yblock_sse2_end_16\ - "addl $"PTR_SIZE"*1, %1 \n\t"\ + "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ snow_inner_add_yblock_sse2_end_common1\ "dec %2 \n\t"\ snow_inner_add_yblock_sse2_end_common2 diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c new file mode 100644 index 0000000000..c961178e51 --- /dev/null +++ b/libavcodec/x86/v210-init.c @@ -0,0 +1,48 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavcodec/v210dec.h" + +extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); + +av_cold void v210_x86_init(V210DecContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_YASM + if (s->aligned_input) { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3; + + if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + } + else { + if (cpu_flags & AV_CPU_FLAG_SSSE3) + s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3; + + if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX) + s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + } +#endif +} diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm new file mode 100644 index 0000000000..f39419e2b2 --- /dev/null +++ b/libavcodec/x86/v210.asm @@ -0,0 +1,89 @@ +;****************************************************************************** +;* V210 SIMD unpack +;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> +;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +v210_mask: times 4 dd 0x3ff +v210_mult: dw 64,4,64,4,64,4,64,4 +v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 +v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 + +SECTION .text + +%macro v210_planar_unpack 2 + +; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) +cglobal v210_planar_unpack_%1_%2, 5, 5 + movsxdifnidn r4, r4d + lea r1, [r1+2*r4] + add r2, r4 + add r3, r4 + neg r4 + + mova m3, [v210_mult] + mova m4, [v210_mask] + mova m5, [v210_luma_shuf] + mova m6, [v210_chroma_shuf] +.loop +%ifidn %1, unaligned + movu m0, [r0] +%else + mova m0, [r0] +%endif + + pmullw m1, m0, m3 + psrld m0, 10 + psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 + pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ + + shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ + pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ + movu [r1+2*r4], m2 + + shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ + pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ + movq [r2+r4], m1 + movhps [r3+r4], m1 + + add r0, mmsize + add r4, 6 + jl .loop + + REP_RET +%endmacro + +INIT_XMM +v210_planar_unpack unaligned, ssse3 +%if HAVE_AVX +INIT_AVX +v210_planar_unpack unaligned, avx +%endif + +INIT_XMM +v210_planar_unpack aligned, ssse3 +%if HAVE_AVX +INIT_AVX +v210_planar_unpack aligned, avx +%endif diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index aae08c2364..5922a6dbf7 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -755,6 +755,9 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; + + if (HAVE_YASM) + dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd; } if (mm_flags & AV_CPU_FLAG_MMXEXT) { @@ -782,6 +785,16 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; + + if (HAVE_YASM) + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd; + } else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_3DNOW) { + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd; + } + + if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3) { + dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd; } #endif /* HAVE_INLINE_ASM */ @@ -795,14 +808,10 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) #if HAVE_YASM if (mm_flags & AV_CPU_FLAG_MMX) { - dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd; } if (mm_flags & AV_CPU_FLAG_MMXEXT) { ASSIGN_LF(mmx2); - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd; - } else if (mm_flags & AV_CPU_FLAG_3DNOW) { - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd; } if (mm_flags & AV_CPU_FLAG_SSE2) { @@ -813,8 +822,6 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) } if (mm_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_LF(ssse3); - dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd; } if (mm_flags & AV_CPU_FLAG_SSE4) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm index ced2b5ba88..590aa509a7 100644 --- a/libavcodec/x86/vc1dsp_yasm.asm +++ b/libavcodec/x86/vc1dsp_yasm.asm @@ -2,25 +2,25 @@ ;* VC1 deblocking optimizations ;* Copyright (c) 2009 David Conrad ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" cextern pw_4 cextern pw_5 diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 7a88892c11..dcd55f5f8b 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -2,25 +2,25 @@ ;* MMX/SSE2-optimized functions for the VP3 decoder ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" ; MMX-optimized functions cribbed from the original VP3 source code. diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index 45af041a6e..8905452dcf 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h index be2dd30b8d..ddbf38b1a9 100644 --- a/libavcodec/x86/vp56_arith.h +++ b/libavcodec/x86/vp56_arith.h @@ -4,20 +4,20 @@ * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> * Copyright (C) 2010 Eli Friedman * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm index ca4d97ec15..21ff041d61 100644 --- a/libavcodec/x86/vp56dsp.asm +++ b/libavcodec/x86/vp56dsp.asm @@ -3,25 +3,25 @@ ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" cextern pw_64 diff --git a/libavcodec/x86/vp56dsp_init.c b/libavcodec/x86/vp56dsp_init.c index ae04440611..9108aa446d 100644 --- a/libavcodec/x86/vp56dsp_init.c +++ b/libavcodec/x86/vp56dsp_init.c @@ -3,20 +3,20 @@ * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index 64dd8ceadf..8c17fa0382 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -3,20 +3,20 @@ * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 531b205b7b..75c0b56f94 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -3,25 +3,25 @@ ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86inc.asm" -%include "x86util.asm" +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA |