Diffstat (limited to 'libavcodec/x86')
106 files changed, 5594 insertions, 828 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6f4935bc3e..c7cdc2fcf5 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -25,9 +25,11 @@ OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o +OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \ x86/rv40dsp_init.o +OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o @@ -36,6 +38,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o +OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ @@ -44,9 +47,12 @@ MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ x86/idct_sse2_xvid.o \ x86/rnd_mmx.o \ x86/simple_idct.o +MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ x86/hpeldsp_mmx.o \ x86/rnd_mmx.o +MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o +MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o YASM-OBJS += x86/deinterlace.o \ @@ -55,6 +61,8 @@ YASM-OBJS += x86/deinterlace.o \ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o +YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\ + x86/dwt_yasm.o YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ x86/fpel.o \ x86/mpeg4qpel.o \ @@ -81,9 +89,11 @@ YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o +YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ x86/rv40dsp.o +YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o @@ -91,4 +101,7 @@ YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o \ x86/vp8dsp_loopfilter.o -YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o +YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9itxfm.o \ + x86/vp9lpf.o \ + x86/vp9mc.o +YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm index b43231855c..89a64f50d4 100644 --- a/libavcodec/x86/ac3dsp.asm +++ b/libavcodec/x86/ac3dsp.asm @@ -2,20 +2,20 @@ ;* x86-optimized AC-3 DSP utils ;* Copyright (c) 2011 Justin Ruggles ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c index f3a921384e..388fc0c696 100644 --- a/libavcodec/x86/ac3dsp_init.c +++ b/libavcodec/x86/ac3dsp_init.c @@ -2,20 +2,20 @@ * x86-optimized AC-3 DSP utils * Copyright (c) 2011 Justin Ruggles * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -65,6 +65,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len); +#if ARCH_X86_32 && defined(__INTEL_COMPILER) +# undef HAVE_7REGS +# define HAVE_7REGS 0 +#endif + #if HAVE_SSE_INLINE && HAVE_7REGS #define IF1(x) x diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h index fdb0a2975c..558d287032 100644 --- a/libavcodec/x86/cabac.h +++ b/libavcodec/x86/cabac.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,8 +27,27 @@ #include "libavutil/internal.h" #include "config.h" +#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ + || ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) +# define BROKEN_COMPILER 1 +#else +# define BROKEN_COMPILER 0 +#endif + #if HAVE_INLINE_ASM +#ifndef UNCHECKED_BITSTREAM_READER +#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER +#endif + +#if UNCHECKED_BITSTREAM_READER +#define END_CHECK(end) "" +#else +#define END_CHECK(end) \ + "cmp "end" , %%"REG_c" \n\t"\ + "jge 1f \n\t" +#endif + #ifdef BROKEN_RELOCATIONS #define TABLES_ARG , "r"(tables) @@ -73,8 +92,7 @@ "test "lowword" , "lowword" \n\t"\ "jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ - "cmp "end" , %%"REG_c" \n\t"\ - "jge 1f \n\t"\ + END_CHECK(end)\ "add"OPSIZE" $2 , "byte" \n\t"\ "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ @@ -93,6 +111,7 @@ #else /* BROKEN_RELOCATIONS */ #define TABLES_ARG +#define RIP_ARG #if HAVE_FAST_CMOV #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ @@ -134,8 +153,7 @@ "test "lowword" , "lowword" \n\t"\ " jnz 2f \n\t"\ "mov "byte" , %%"REG_c" \n\t"\ - "cmp "end" , %%"REG_c" \n\t"\ - "jge 1f \n\t"\ + END_CHECK(end)\ "add"OPSIZE" $2 , "byte" \n\t"\ "1: \n\t"\ "movzwl (%%"REG_c") , "tmp" \n\t"\ @@ -154,8 +172,7 @@ #endif /* BROKEN_RELOCATIONS */ - -#if HAVE_7REGS +#if HAVE_7REGS && !BROKEN_COMPILER #define get_cabac_inline get_cabac_inline_x86 static av_always_inline int get_cabac_inline_x86(CABACContext *c, uint8_t *const state) @@ -178,17 +195,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c, AV_STRINGIFY(H264_LPS_RANGE_OFFSET), AV_STRINGIFY(H264_MLPS_STATE_OFFSET), "%8") - : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp) + : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp) : "r"(state), "r"(c), "i"(offsetof(CABACContext, bytestream)), "i"(offsetof(CABACContext, bytestream_end)) TABLES_ARG + ,"1"(c->low), "2"(c->range) : "%"REG_c, "memory" ); return bit & 1; } #endif /* HAVE_7REGS */ +#if !BROKEN_COMPILER #define get_cabac_bypass_sign get_cabac_bypass_sign_x86 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) { @@ -211,10 +230,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) "movzwl (%1), %%edx \n\t" "bswap %%edx \n\t" "shrl $15, %%edx \n\t" +#if UNCHECKED_BITSTREAM_READER + "add $2, %1 \n\t" + "addl %%edx, %%eax \n\t" + "mov %1, %c4(%2) \n\t" +#else "addl %%edx, %%eax \n\t" "cmp %c5(%2), %1 \n\t" "jge 1f \n\t" "add"OPSIZE" $2, %c4(%2) \n\t" +#endif "1: \n\t" "movl %%eax, %c3(%2) \n\t" @@ -268,6 +293,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c) ); return res; } +#endif /* !BROKEN_COMPILER */ #endif /* HAVE_INLINE_ASM */ #endif /* AVCODEC_X86_CABAC_H */ diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index bc9cbf7411..aaa09d1784 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -5,20 +5,20 @@ * MMX-optimized DSP functions, based on H.264 optimizations by * Michael Niedermayer and Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 5b8d1b224f..3bba80bd87 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -1,20 +1,20 @@ /* * MMX/SSE constants used across x86 dsp optimizations. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index f38fbe3425..8097bc49f0 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -1,20 +1,20 @@ /* * MMX/SSE constants used across x86 dsp optimizations. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm index 9c147b9c00..6fd5ba350d 100644 --- a/libavcodec/x86/dct32.asm +++ b/libavcodec/x86/dct32.asm @@ -2,20 +2,20 @@ ;* 32 point SSE-optimized DCT transform ;* Copyright (c) 2010 Vitor Sessak ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 INIT_YMM avx SECTION_TEXT +%if HAVE_AVX_EXTERNAL ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) cglobal dct32_float, 2,3,8, out, in, tmp ; pass 1 @@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp INIT_XMM PASS6_AND_PERMUTE RET +%endif %if ARCH_X86_64 %define SPILL SWAP diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c index 7bda5e81b6..85e2d0c3e6 100644 --- a/libavcodec/x86/dct_init.c +++ b/libavcodec/x86/dct_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm index b2828f3f17..3812dbe601 100644 --- a/libavcodec/x86/deinterlace.asm +++ b/libavcodec/x86/deinterlace.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Vitor Sessak ;* Copyright (c) 2002 Michael Niedermayer ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/dirac_dwt.c b/libavcodec/x86/dirac_dwt.c new file mode 100644 index 0000000000..04c514f4fd --- /dev/null +++ b/libavcodec/x86/dirac_dwt.c @@ -0,0 +1,202 @@ +/* + * MMX optimized discrete wavelet transform + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "dsputil_x86.h" +#include "dirac_dwt.h" + +#define COMPOSE_VERTICAL(ext, align) \ +void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ +void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ +void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ +void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ +\ +static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ +\ + ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ +} \ +\ +static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +\ +static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ + IDWTELEM *b3, IDWTELEM *b4, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) \ + b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ +\ + ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ +} \ +static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \ +{ \ + int i, width_align = width&~(align-1); \ +\ + for(i=width_align; i<width; i++) { \ + b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ + b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ + } \ +\ + ff_vertical_compose_haar##ext(b0, b1, width_align); \ +} \ +static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar0i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = tmp[x];\ + b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ + }\ +}\ +static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ +{\ + int w2= w>>1;\ + int x= w2 - (w2&(align-1));\ + ff_horizontal_compose_haar1i##ext(b, tmp, w);\ +\ + for (; x < w2; x++) {\ + b[2*x ] = (tmp[x] + 1)>>1;\ + b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ + }\ +}\ +\ + +#if HAVE_YASM +#if !ARCH_X86_64 +COMPOSE_VERTICAL(_mmx, 4) +#endif +COMPOSE_VERTICAL(_sse2, 8) + + +void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); + +static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w) 
+{ + int w2= w>>1; + int x= w2 - (w2&7); + ff_horizontal_compose_dd97i_ssse3(b, tmp, w); + + for (; x < w2; x++) { + b[2*x ] = (tmp[x] + 1)>>1; + b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; + } +} +#endif + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) +{ +#if HAVE_YASM + int mm_flags = av_get_cpu_flags(); + +#if !ARCH_X86_64 + if (!(mm_flags & AV_CPU_FLAG_MMX)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar0i_mmx; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_mmx; + d->horizontal_compose = horizontal_compose_haar1i_mmx; + break; + } +#endif + + if (!(mm_flags & AV_CPU_FLAG_SSE2)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_LEGALL5_3: + d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; + break; + case DWT_DIRAC_DD13_7: + d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; + d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; + break; + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar0i_sse2; + break; + case DWT_DIRAC_HAAR1: + d->vertical_compose = (void*)vertical_compose_haar_sse2; + d->horizontal_compose = horizontal_compose_haar1i_sse2; + break; + } + + if (!(mm_flags & AV_CPU_FLAG_SSSE3)) + return; + + switch (type) { + case DWT_DIRAC_DD9_7: + d->horizontal_compose = horizontal_compose_dd97i_ssse3; + break; + } +#endif // HAVE_YASM +} diff --git a/libavcodec/x86/dirac_dwt.h b/libavcodec/x86/dirac_dwt.h new file mode 100644 index 0000000000..126b29029f --- /dev/null +++ b/libavcodec/x86/dirac_dwt.h @@ -0,0 +1,30 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DIRAC_DWT_H +#define AVCODEC_X86_DIRAC_DWT_H + +#include "libavcodec/dirac_dwt.h" + +void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); +void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); + +void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type); + +#endif diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c new file mode 100644 index 0000000000..a28bb82060 --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_x86.h" +#include "diracdsp_mmx.h" + +void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); + +#define HPEL_FILTER(MMSIZE, EXT) \ + void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \ + void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \ + \ + static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \ + const uint8_t *src, int stride, int width, int height) \ + { \ + while( height-- ) \ + { \ + ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \ + ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \ + ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \ + \ + dsth += stride; \ + dstv += stride; \ + dstc += stride; \ + src += stride; \ + } \ + } + +#if !ARCH_X86_64 +HPEL_FILTER(8, mmx) +#endif +HPEL_FILTER(16, sse2) + +#define PIXFUNC(PFX, IDX, EXT) \ + /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \ + c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \ + c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT + +void ff_diracdsp_init_mmx(DiracDSPContext* c) +{ + int mm_flags = av_get_cpu_flags(); + + if (!(mm_flags & AV_CPU_FLAG_MMX)) + return; + +#if HAVE_YASM + c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; +#if !ARCH_X86_64 + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; + c->add_dirac_obmc[2] = 
ff_add_dirac_obmc32_mmx; + c->dirac_hpel_filter = dirac_hpel_filter_mmx; + c->add_rect_clamped = ff_add_rect_clamped_mmx; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; +#endif +#endif + +#if HAVE_MMX_INLINE + PIXFUNC(put, 0, mmx); + PIXFUNC(avg, 0, mmx); +#endif + +#if HAVE_MMXEXT_INLINE + if (mm_flags & AV_CPU_FLAG_MMX2) { + PIXFUNC(avg, 0, mmxext); + } +#endif + + if (mm_flags & AV_CPU_FLAG_SSE2) { +#if HAVE_YASM + c->dirac_hpel_filter = dirac_hpel_filter_sse2; + c->add_rect_clamped = ff_add_rect_clamped_sse2; + c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; + + c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; + c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; +#endif +#if HAVE_SSE2_INLINE + c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; + c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; + c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; + c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; +#endif + } +} diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h new file mode 100644 index 0000000000..89858544f3 --- /dev/null +++ b/libavcodec/x86/diracdsp_mmx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DIRACDSP_H +#define AVCODEC_X86_DIRACDSP_H + +#include "libavcodec/diracdsp.h" + +void ff_diracdsp_init_mmx(DiracDSPContext* c); + +DECL_DIRAC_PIXOP(put, mmx); +DECL_DIRAC_PIXOP(avg, mmx); +DECL_DIRAC_PIXOP(avg, mmxext); + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); + +void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); +void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); + +void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + +#endif diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm new file mode 100644 index 0000000000..3e9765b42d --- /dev/null +++ b/libavcodec/x86/diracdsp_yasm.asm @@ -0,0 +1,264 @@ +;****************************************************************************** +;* Copyright (c) 2010 David Conrad +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +pw_3: times 8 dw 3 +pw_7: times 8 dw 7 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +pb_128: times 16 db 128 + +section .text + +%macro UNPACK_ADD 6 + mov%5 %1, %3 + mov%6 m5, %4 + mova m4, %1 + mova %2, m5 + punpcklbw %1, m7 + punpcklbw m5, m7 + punpckhbw m4, m7 + punpckhbw %2, m7 + paddw %1, m5 + paddw %2, m4 +%endmacro + +%macro HPEL_FILTER 1 +; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); +cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 + mov src0q, srcq + lea stridex3q, [3*strideq] + sub src0q, stridex3q + pxor m7, m7 +.loop: + ; 7*(src[0] + src[1]) + UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a + pmullw m0, [pw_7] + pmullw m1, [pw_7] + + ; 3*( ... + src[-2] + src[3]) + UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a + paddw m0, m2 + paddw m1, m3 + pmullw m0, [pw_3] + pmullw m1, [pw_3] + + ; ... - 7*(src[-1] + src[2]) + UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a + pmullw m2, [pw_7] + pmullw m3, [pw_7] + psubw m0, m2 + psubw m1, m3 + + ; ... - (src[-3] + src[4]) + UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a + psubw m0, m2 + psubw m1, m3 + + paddw m0, [pw_16] + paddw m1, [pw_16] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, mmsize + add srcq, mmsize + add src0q, mmsize + sub widthd, mmsize + jg .loop + RET + +; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); +cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width + dec widthd + pxor m7, m7 + and widthd, ~(mmsize-1) +.loop: + ; 7*(src[0] + src[1]) + UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u + pmullw m0, [pw_7] + pmullw m1, [pw_7] + + ; 3*( ... + src[-2] + src[3]) + UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u + paddw m0, m2 + paddw m1, m3 + pmullw m0, [pw_3] + pmullw m1, [pw_3] + + ; ... - 7*(src[-1] + src[2]) + UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u + pmullw m2, [pw_7] + pmullw m3, [pw_7] + psubw m0, m2 + psubw m1, m3 + + ; ... 
- (src[-3] + src[4]) + UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u + psubw m0, m2 + psubw m1, m3 + + paddw m0, [pw_16] + paddw m1, [pw_16] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + mova [dstq + widthq], m0 + sub widthd, mmsize + jge .loop + RET +%endmacro + +%macro PUT_RECT 1 +; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height) +cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 + mova m0, [pb_128] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + movsxd dst_strideq, dst_strided + movsxd src_strideq, src_strided + mov r7d, r5m + mov r8d, wd + %define wspill r8d + %define hd r7d +%else + mov r4m, wd + %define wspill r4m + %define hd r5mp +%endif + +.loopy + lea src2q, [srcq+src_strideq*2] + lea dst2q, [dstq+dst_strideq] +.loopx: + sub wd, mmsize + mova m1, [srcq +2*wq] + mova m2, [src2q+2*wq] + packsswb m1, [srcq +2*wq+mmsize] + packsswb m2, [src2q+2*wq+mmsize] + paddb m1, m0 + paddb m2, m0 + mova [dstq +wq], m1 + mova [dst2q+wq], m2 + jg .loopx + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*2] + sub hd, 2 + mov wd, wspill + jg .loopy + RET +%endm + +%macro ADD_RECT 1 +; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) +cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h + mova m0, [pw_32] + add wd, (mmsize-1) + and wd, ~(mmsize-1) + +%if ARCH_X86_64 + movsxd strideq, strided + movsxd idwt_strideq, idwt_strided + mov r8d, wd + %define wspill r8d +%else + mov r5m, wd + %define wspill r5m +%endif + +.loop: + sub wd, mmsize + movu m1, [srcq +2*wq] ; FIXME: ensure alignment + paddw m1, m0 + psraw m1, 6 + movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment + paddw m2, m0 + psraw m2, 6 + paddw m1, [idwtq+2*wq] + paddw m2, [idwtq+2*wq+mmsize] + packuswb m1, m2 + mova [dstq +wq], m1 + jg .loop + + lea srcq, [srcq + 2*strideq] + add dstq, strideq + lea idwtq, [idwtq+ 2*idwt_strideq] + sub hd, 1 + mov wd, wspill + jg .loop + RET +%endm + +%macro ADD_OBMC 2 +; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) +cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen + pxor m4, m4 +.loop: +%assign i 0 +%rep %1 / mmsize + mova m0, [srcq+i] + mova m1, m0 + punpcklbw m0, m4 + punpckhbw m1, m4 + mova m2, [obmcq+i] + mova m3, m2 + punpcklbw m2, m4 + punpckhbw m3, m4 + pmullw m0, m2 + pmullw m1, m3 + movu m2, [dstq+2*i] + movu m3, [dstq+2*i+mmsize] + paddw m0, m2 + paddw m1, m3 + movu [dstq+2*i], m0 + movu [dstq+2*i+mmsize], m1 +%assign i i+mmsize +%endrep + lea srcq, [srcq+strideq] + lea dstq, [dstq+2*strideq] + add obmcq, 32 + sub yblend, 1 + jg .loop + RET +%endm + +INIT_MMX +%if ARCH_X86_64 == 0 +PUT_RECT mmx +ADD_RECT mmx + +HPEL_FILTER mmx +ADD_OBMC 32, mmx +ADD_OBMC 16, mmx +%endif +ADD_OBMC 8, mmx + +INIT_XMM +PUT_RECT sse2 +ADD_RECT sse2 + +HPEL_FILTER sse2 +ADD_OBMC 32, sse2 +ADD_OBMC 16, sse2 diff --git a/libavcodec/x86/dnxhdenc.c b/libavcodec/x86/dnxhdenc.c index 0bab69f67d..c7e776a4c1 100644 --- a/libavcodec/x86/dnxhdenc.c +++ b/libavcodec/x86/dnxhdenc.c @@ -4,20 +4,20 @@ * * VC-3 encoder funded by the British Broadcasting Corporation * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 5d73ff8ee4..77069e20f8 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -1,21 +1,23 @@ ;****************************************************************************** ;* MMX optimized DSP utils ;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2003-2013 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -605,6 +607,7 @@ cglobal bswap32_buf, 3,4,3 cglobal bswap32_buf, 3,4,5 mov r3, r1 %endif + or r3, r0 and r3, 15 jz .start_align BSWAP_LOOPS u diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index 82864e8857..e0b40410a7 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -1,18 +1,21 @@ /* - * This file is part of Libav. + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * Libav is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -529,24 +532,11 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, c->clear_block = ff_clear_block_mmx; c->clear_blocks = ff_clear_blocks_mmx; c->draw_edges = ff_draw_edges_mmx; - - switch (avctx->idct_algo) { - case FF_IDCT_AUTO: - case FF_IDCT_SIMPLEMMX: - c->idct_put = ff_simple_idct_put_mmx; - c->idct_add = ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; - break; - case FF_IDCT_XVIDMMX: - c->idct_put = ff_idct_xvid_mmx_put; - c->idct_add = ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; - break; - } } +#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) c->gmc = ff_gmc_mmx; +#endif c->add_bytes = ff_add_bytes_mmx; #endif /* HAVE_MMX_INLINE */ @@ -562,7 +552,7 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, #if HAVE_MMXEXT_INLINE const int high_bit_depth = avctx->bits_per_raw_sample > 8; - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { + if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) { c->idct_put = ff_idct_xvid_mmxext_put; c->idct_add = ff_idct_xvid_mmxext_add; c->idct = ff_idct_xvid_mmxext; @@ -595,19 +585,21 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, c->vector_clipf = ff_vector_clipf_sse; -#if FF_API_XVMC -FF_DISABLE_DEPRECATION_WARNINGS /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ - if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1) + if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb) return; -FF_ENABLE_DEPRECATION_WARNINGS -#endif /* FF_API_XVMC */ if (!high_bit_depth) { c->clear_block = ff_clear_block_sse; c->clear_blocks = ff_clear_blocks_sse; } #endif /* HAVE_SSE_INLINE */ + +#if HAVE_YASM +#if HAVE_INLINE_ASM && CONFIG_VIDEODSP + c->gmc = ff_gmc_sse; +#endif +#endif /* HAVE_YASM */ } static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, @@ -616,7 +608,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, #if HAVE_SSE2_INLINE const int high_bit_depth = avctx->bits_per_raw_sample > 8; - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { + if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) { c->idct_put = ff_idct_xvid_sse2_put; c->idct_add = ff_idct_xvid_sse2_add; c->idct = ff_idct_xvid_sse2; @@ -663,12 +655,30 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx) int cpu_flags = av_get_cpu_flags(); #if HAVE_7REGS && HAVE_INLINE_ASM - if (cpu_flags & AV_CPU_FLAG_CMOV) + if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV) c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov; #endif - if (X86_MMX(cpu_flags)) + if (X86_MMX(cpu_flags)) { +#if HAVE_INLINE_ASM + const int idct_algo = avctx->idct_algo; + + if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { + if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { + c->idct_put = ff_simple_idct_put_mmx; + c->idct_add = ff_simple_idct_add_mmx; + c->idct = ff_simple_idct_mmx; + c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; + } else if (idct_algo == FF_IDCT_XVIDMMX) { + c->idct_put = ff_idct_xvid_mmx_put; + c->idct_add = ff_idct_xvid_mmx_add; + c->idct = ff_idct_xvid_mmx; + } + 
} +#endif /* HAVE_INLINE_ASM */ + dsputil_init_mmx(c, avctx, cpu_flags); + } if (X86_MMXEXT(cpu_flags)) dsputil_init_mmxext(c, avctx, cpu_flags); diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 885c10a70f..df8cfdb464 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -3,30 +3,33 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> */ #include "config.h" +#include "libavutil/avassert.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" +#include "libavcodec/videodsp.h" #include "constants.h" #include "dsputil_x86.h" +#include "diracdsp_mmx.h" #if HAVE_INLINE_ASM @@ -279,7 +282,7 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, : "+r"(ptr) : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) ); - } else { + } else if(w==16){ __asm__ volatile ( "1: \n\t" "movd (%0), %%mm0 \n\t" @@ -300,6 +303,25 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, : "+r"(ptr) : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) ); + } else { + av_assert1(w == 4); + __asm__ volatile ( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "movd %%mm0, -4(%0) \n\t" + "movd -4(%0, %2), %%mm1 \n\t" + "punpcklbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, (%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + "jb 1b \n\t" + : "+r"(ptr) + : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) + ); } /* top and bottom (and hopefully also the corners) */ @@ -345,10 +367,17 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, } } -void ff_gmc_mmx(uint8_t *dst, uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) +typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride, + ptrdiff_t src_linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h); + +static av_always_inline void gmc(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height, + emulated_edge_mc_func *emu_edge_fn) { const int w = 8; const int ix = ox >> (16 + shift); @@ -363,19 +392,24 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src, const 
uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; const uint64_t shift2 = 2 * shift; +#define MAX_STRIDE 4096U +#define MAX_H 8U + uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE]; int x, y; const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); const int dxh = dxy * (h - 1); const int dyw = dyx * (w - 1); + int need_emu = (unsigned)ix >= width - w || + (unsigned)iy >= height - h; + if ( // non-constant fullpel offset (3% of blocks) ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) // uses more than 16 bits of subpel mv (only at huge resolution) - || (dxx | dxy | dyx | dyy) & 15 || - (unsigned)ix >= width - w || - (unsigned)iy >= height - h) { + || (dxx | dxy | dyx | dyy) & 15 + || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) { // FIXME could still use mmx for some of the rows ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); @@ -383,6 +417,10 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src, } src += ix + iy * stride; + if (need_emu) { + emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height); + src = edge_buf; + } __asm__ volatile ( "movd %0, %%mm6 \n\t" @@ -461,6 +499,108 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src, } } +#if CONFIG_VIDEODSP +#if HAVE_YASM +#if ARCH_X86_32 +void ff_gmc_mmx(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &ff_emulated_edge_mc_8); +} +#endif +void ff_gmc_sse(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &ff_emulated_edge_mc_8); +} +#else +void ff_gmc_mmx(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) +{ + gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, + width, height, &ff_emulated_edge_mc_8); +} +#endif +#endif + +#if CONFIG_DIRAC_DECODER +#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\ +void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3)\ + ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\ + else\ + OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\ +}\ +void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3)\ + ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\ + else\ + OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\ +}\ +void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ +{\ + if (h&3) {\ + ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\ + } else {\ + OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\ + OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\ + }\ +} + +#if HAVE_MMX_INLINE +PIXELS16(static, ff_avg, , , _mmxext) +DIRAC_PIXOP(put, ff_put, mmx) +DIRAC_PIXOP(avg, ff_avg, mmx) +#endif + +#if HAVE_YASM +DIRAC_PIXOP(avg, ff_avg, mmxext) + +void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) + ff_put_dirac_pixels16_c(dst, src, stride, h); + else + 
ff_put_pixels16_sse2(dst, src[0], stride, h); +} +void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) + ff_avg_dirac_pixels16_c(dst, src, stride, h); + else + ff_avg_pixels16_sse2(dst, src[0], stride, h); +} +void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) { + ff_put_dirac_pixels32_c(dst, src, stride, h); + } else { + ff_put_pixels16_sse2(dst , src[0] , stride, h); + ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h); + } +} +void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) +{ + if (h&3) { + ff_avg_dirac_pixels32_c(dst, src, stride, h); + } else { + ff_avg_pixels16_sse2(dst , src[0] , stride, h); + ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h); + } +} +#endif +#endif + void ff_vector_clipf_sse(float *dst, const float *src, float min, float max, int len) { diff --git a/libavcodec/x86/dsputil_qns_template.c b/libavcodec/x86/dsputil_qns_template.c index 20a40a175e..bde6b0a606 100644 --- a/libavcodec/x86/dsputil_qns_template.c +++ b/libavcodec/x86/dsputil_qns_template.c @@ -5,20 +5,20 @@ * MMX optimization by Michael Niedermayer <michaelni@gmx.at> * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -28,7 +28,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[ { x86_reg i=0; - assert(FFABS(scale) < MAX_ABS); + av_assert2(FFABS(scale) < MAX_ABS); scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; SET_RND(mm6); diff --git a/libavcodec/x86/dsputil_x86.c b/libavcodec/x86/dsputil_x86.c index 144339be64..f43b9d782d 100644 --- a/libavcodec/x86/dsputil_x86.c +++ b/libavcodec/x86/dsputil_x86.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h index c8615b2472..356b2c142f 100644 --- a/libavcodec/x86/dsputil_x86.h +++ b/libavcodec/x86/dsputil_x86.h @@ -2,20 +2,20 @@ * MMX optimized DSP utils * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -129,6 +129,11 @@ void ff_gmc_mmx(uint8_t *dst, uint8_t *src, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); +void ff_gmc_sse(uint8_t *dst, uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height); + void ff_vector_clipf_sse(float *dst, const float *src, float min, float max, int len); @@ -162,6 +167,10 @@ void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); + +void ff_mmx_idct(int16_t *block); +void ff_mmxext_idct(int16_t *block); + void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index 7e4fd8152c..1839bee24a 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2000, 2001 Fabrice Bellard ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index a1f80afe53..5de8ade8b1 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -5,20 +5,20 @@ * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -424,8 +424,8 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { int tmp; - assert( (((int)pix) & 7) == 0); - assert((line_size &7) ==0); + av_assert2( (((int)pix) & 7) == 0); + av_assert2((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0), %%mm2\n"\ @@ -487,8 +487,8 @@ static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy, { int tmp; - assert( (((int)pix) & 7) == 0); - assert((line_size &7) ==0); + av_assert2( (((int)pix) & 7) == 0); + av_assert2((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0), " #out0 "\n"\ @@ -527,9 +527,9 @@ static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy, static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; - assert( (((int)pix1) & 7) == 0); - assert( (((int)pix2) & 7) == 0); - assert((line_size &7) ==0); + av_assert2( (((int)pix1) & 7) == 0); + av_assert2( (((int)pix2) & 7) == 0); + av_assert2((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0),%%mm2\n"\ @@ -607,9 +607,9 @@ static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2, { int tmp; - assert( (((int)pix1) & 7) == 0); - assert( (((int)pix2) & 7) == 0); - assert((line_size &7) ==0); + av_assert2( (((int)pix1) & 7) == 0); + av_assert2( (((int)pix2) & 7) == 0); + av_assert2((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0)," #out0 "\n"\ @@ -661,8 +661,9 @@ static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2, } #undef SUM -static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ +static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){ x86_reg i=0; + if(w>=16) __asm__ volatile( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm new file mode 
100644 index 0000000000..5253abc6c8 --- /dev/null +++ b/libavcodec/x86/dwt_yasm.asm @@ -0,0 +1,306 @@ +;****************************************************************************** +;* MMX optimized discrete wavelet transform +;* Copyright (c) 2010 David Conrad +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_1991: times 4 dw 9,-1 + +section .text + +; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2 +%macro COMPOSE_53iL0 4 + paddw %2, %3 + paddw %2, %4 + psraw %2, 2 + psubw %1, %2 +%endm + +; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4 +; if %4 is supplied, %1 is loaded unaligned from there +; m2: clobbered m3: pw_8 m4: pw_1991 +%macro COMPOSE_DD97iH0 3-4 + paddw m0, %3 + paddw m1, %2 + psubw m0, m3 + mova m2, m1 + punpcklwd m1, m0 + punpckhwd m2, m0 + pmaddwd m1, m4 + pmaddwd m2, m4 +%if %0 > 3 + movu %1, %4 +%endif + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + paddw m1, %1 +%endm + +%macro COMPOSE_VERTICAL 1 +; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; int width) +cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width + mova m2, [pw_2] +%if ARCH_X86_64 + mov widthd, widthd +%endif +.loop: + sub widthq, mmsize/2 + mova m1, [b0q+2*widthq] + mova m0, [b1q+2*widthq] + COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 + mova [b1q+2*widthq], m0 + jg .loop + REP_RET + +; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; int width) +cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width + mova m1, [pw_1] +%if ARCH_X86_64 + mov widthd, widthd +%endif +.loop: + sub widthq, mmsize/2 + mova m0, [b0q+2*widthq] + paddw m0, [b2q+2*widthq] + paddw m0, m1 + psraw m0, 1 + paddw m0, [b1q+2*widthq] + mova [b1q+2*widthq], m0 + jg .loop + REP_RET + +; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; IDWTELEM *b3, IDWTELEM *b4, int width) +cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width + mova m3, [pw_8] + mova m4, [pw_1991] +%if ARCH_X86_64 + mov widthd, widthd +%endif +.loop: + sub widthq, mmsize/2 + mova m0, [b0q+2*widthq] + mova m1, [b1q+2*widthq] + COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] + mova [b2q+2*widthq], m1 + jg .loop + REP_RET + +; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, +; IDWTELEM *b3, IDWTELEM *b4, int width) +cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width + mova m3, [pw_16] + mova m4, [pw_1991] +%if ARCH_X86_64 + mov widthd, widthd +%endif +.loop: + sub widthq, mmsize/2 + mova m0, [b0q+2*widthq] + mova m1, [b1q+2*widthq] + mova m5, [b2q+2*widthq] + paddw m0, [b4q+2*widthq] + paddw m1,
[b3q+2*widthq] + psubw m0, m3 + mova m2, m1 + punpcklwd m1, m0 + punpckhwd m2, m0 + pmaddwd m1, m4 + pmaddwd m2, m4 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + psubw m5, m1 + mova [b2q+2*widthq], m5 + jg .loop + REP_RET + +; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width) +cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width + mova m3, [pw_1] +%if ARCH_X86_64 + mov widthd, widthd +%endif +.loop: + sub widthq, mmsize/2 + mova m1, [b1q+2*widthq] + mova m0, [b0q+2*widthq] + mova m2, m1 + paddw m1, m3 + psraw m1, 1 + psubw m0, m1 + mova [b0q+2*widthq], m0 + paddw m2, m0 + mova [b1q+2*widthq], m2 + jg .loop + REP_RET +%endmacro + +; extend the left and right edges of the tmp array by %1 and %2 respectively +%macro EDGE_EXTENSION 3 + mov %3, [tmpq] +%assign %%i 1 +%rep %1 + mov [tmpq-2*%%i], %3 + %assign %%i %%i+1 +%endrep + mov %3, [tmpq+2*w2q-2] +%assign %%i 0 +%rep %2 + mov [tmpq+2*w2q+2*%%i], %3 + %assign %%i %%i+1 +%endrep +%endmacro + + +%macro HAAR_HORIZONTAL 2 +; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xq, xq + shr w2d, 1 + lea b_w2q, [bq+wq] + mova m3, [pw_1] +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + paddw m1, m3 + psraw m1, 1 + psubw m0, m1 + mova [tmpq + 2*xq], m0 + add xq, mmsize/2 + cmp xq, w2q + jl .lowpass_loop + + xor xq, xq + and w2q, ~(mmsize/2 - 1) + cmp w2q, mmsize/2 + jl .end + +.highpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [tmpq + 2*xq] + paddw m1, m0 + + ; shift and interleave +%if %2 == 1 + paddw m0, m3 + paddw m1, m3 + psraw m0, 1 + psraw m1, 1 +%endif + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m0 + mova [bq+4*xq+mmsize], m2 + + add xq, mmsize/2 + cmp xq, w2q + jl .highpass_loop +.end: + REP_RET +%endmacro + + +INIT_XMM +; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) +cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 + mov w2d, wd + xor xd, xd + shr w2d, 1 + lea b_w2q, [bq+wq] + movu m4, [bq+wq] + mova m7, [pw_2] + pslldq m4, 14 +.lowpass_loop: + movu m1, [b_w2q + 2*xq] + mova m0, [bq + 2*xq] + mova m2, m1 + palignr m1, m4, 14 + mova m4, m2 + COMPOSE_53iL0 m0, m1, m2, m7 + mova [tmpq + 2*xq], m0 + add xd, mmsize/2 + cmp xd, w2d + jl .lowpass_loop + + EDGE_EXTENSION 1, 2, xw + ; leave the last up to 7 (sse) or 3 (mmx) values for C + xor xd, xd + and w2d, ~(mmsize/2 - 1) + cmp w2d, mmsize/2 + jl .end + + mova m7, [tmpq-mmsize] + mova m0, [tmpq] + mova m5, [pw_1] + mova m3, [pw_8] + mova m4, [pw_1991] +.highpass_loop: + mova m6, m0 + palignr m0, m7, 14 + mova m7, [tmpq + 2*xq + 16] + mova m1, m7 + mova m2, m7 + palignr m1, m6, 2 + palignr m2, m6, 4 + COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] + mova m0, m7 + mova m7, m6 + + ; shift and interleave + paddw m6, m5 + paddw m1, m5 + psraw m6, 1 + psraw m1, 1 + mova m2, m6 + punpcklwd m6, m1 + punpckhwd m2, m1 + mova [bq+4*xq], m6 + mova [bq+4*xq+mmsize], m2 + + add xd, mmsize/2 + cmp xd, w2d + jl .highpass_loop +.end: + REP_RET + + +%if ARCH_X86_64 == 0 +INIT_MMX +COMPOSE_VERTICAL mmx +HAAR_HORIZONTAL mmx, 0 +HAAR_HORIZONTAL mmx, 1 +%endif + +;;INIT_XMM +INIT_XMM +COMPOSE_VERTICAL sse2 +HAAR_HORIZONTAL sse2, 0 +HAAR_HORIZONTAL sse2, 1 diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c index 6d595aa76f..11a13bb704 100644 --- a/libavcodec/x86/fdct.c +++ b/libavcodec/x86/fdct.c @@ -13,20 +13,20 @@ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm * Skal's fdct at 
http://skal.planet-d.net/coding/dct.html * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; -static struct +static const struct { DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; } fdct_r_row_sse2 = @@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct 29692, -12299, 26722, -31521, }; -static struct +static const struct { DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; } tab_frw_01234567_sse2 = diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm index e4744a3b60..cae404c1a2 100644 --- a/libavcodec/x86/fft.asm +++ b/libavcodec/x86/fft.asm @@ -6,20 +6,20 @@ ;* This algorithm (though not any of the implementation details) is ;* based on libdjbfft by D. J. Bernstein. ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -36,6 +36,8 @@ %define pointer resd %endif +SECTION_RODATA 32 + struc FFTContext .nbits: resd 1 .reverse: resd 1 @@ -51,13 +53,10 @@ struc FFTContext .imdcthalf:pointer 1 endstruc -SECTION_RODATA - %define M_SQRT1_2 0.70710678118654752440 %define M_COS_PI_1_8 0.923879532511287 %define M_COS_PI_3_8 0.38268343236509 -align 32 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 @@ -305,6 +304,7 @@ IF%1 mova Z(1), m5 INIT_YMM avx +%if HAVE_AVX_EXTERNAL align 16 fft8_avx: mova m0, Z(0) @@ -394,6 +394,8 @@ fft32_interleave_avx: jg .deint_loop ret +%endif + INIT_XMM sse align 16 @@ -537,6 +539,7 @@ DEFINE_ARGS zc, w, n, o1, o3 INIT_YMM avx +%if HAVE_AVX_EXTERNAL %macro INTERL_AVX 5 vunpckhps %3, %2, %1 vunpcklps %2, %2, %1 @@ -558,6 +561,7 @@ cglobal fft_calc, 2,5,8 FFT_DISPATCH _interleave %+ SUFFIX, r1 REP_RET +%endif INIT_XMM sse @@ -776,9 +780,11 @@ align 8 dispatch_tab %+ fullsuffix: pointer list_of_fft %endmacro ; DECL_FFT +%if HAVE_AVX_EXTERNAL INIT_YMM avx DECL_FFT 6 DECL_FFT 6, _interleave +%endif INIT_XMM sse DECL_FFT 5 DECL_FFT 5, _interleave @@ -1080,4 +1086,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW %endif INIT_YMM avx + +%if HAVE_AVX_EXTERNAL DECL_IMDCT POSROTATESHUF_AVX +%endif diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index a604956836..398091eb1f 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c index 7ca72c54a4..5682230c8e 100644 --- a/libavcodec/x86/fft_init.c +++ b/libavcodec/x86/fft_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index e7803df4e5..60078e2c2c 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -2,20 +2,20 @@ ;* x86 optimized Format Conversion Utils ;* Copyright (c) 2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c index 3d75df92bd..d300dfd864 100644 --- a/libavcodec/x86/fmtconvert_init.c +++ b/libavcodec/x86/fmtconvert_init.c @@ -5,20 +5,20 @@ * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index 43b039d0e0..dc363d1a63 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2003-2013 Michael Niedermayer ;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c index 1ae8f86466..384ab89d9c 100644 --- a/libavcodec/x86/fpel_mmx.c +++ b/libavcodec/x86/fpel_mmx.c @@ -4,20 +4,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm index a940aad07a..a21baf1629 100644 --- a/libavcodec/x86/h263_loopfilter.asm +++ b/libavcodec/x86/h263_loopfilter.asm @@ -1,20 +1,22 @@ ;****************************************************************************** ;* MMX-optimized H.263 loop filter +;* Copyright (c) 2003-2013 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c index d4fab981bf..ab81063233 100644 --- a/libavcodec/x86/h263dsp_init.c +++ b/libavcodec/x86/h263dsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm index b7b18e03f8..32681aafd7 100644 --- a/libavcodec/x86/h264_chromamc.asm +++ b/libavcodec/x86/h264_chromamc.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, ;* 2005-2008 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm index aec7678d75..beb7c0fe20 100644 --- a/libavcodec/x86/h264_chromamc_10bit.asm +++ b/libavcodec/x86/h264_chromamc_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -251,8 +251,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7 %define CHROMAMC_AVG NOTHING INIT_XMM sse2 CHROMA_MC8 put +%if HAVE_AVX_EXTERNAL INIT_XMM avx CHROMA_MC8 put +%endif INIT_MMX mmxext CHROMA_MC4 put CHROMA_MC2 put @@ -260,8 +262,10 @@ CHROMA_MC2 put %define CHROMAMC_AVG AVG INIT_XMM sse2 CHROMA_MC8 avg +%if HAVE_AVX_EXTERNAL INIT_XMM avx CHROMA_MC8 avg +%endif INIT_MMX mmxext CHROMA_MC4 avg CHROMA_MC2 avg diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 6e29ce7373..1317783bec 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -7,20 +7,20 @@ ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* Oskar Arvidsson <oskar@irock.se> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -382,8 +382,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 INIT_XMM sse2 DEBLOCK_LUMA +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA +%endif %else @@ -495,8 +497,10 @@ INIT_MMX mmxext DEBLOCK_LUMA v8, 8 INIT_XMM sse2 DEBLOCK_LUMA v, 16 +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA v, 16 +%endif %endif ; ARCH @@ -768,8 +772,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80 INIT_XMM sse2 DEBLOCK_LUMA_INTRA v +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA_INTRA v +%endif %if ARCH_X86_64 == 0 INIT_MMX mmxext DEBLOCK_LUMA_INTRA v8 @@ -830,7 +836,11 @@ cglobal deblock_h_chroma_8, 5,7 TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call ff_chroma_inter_body_mmxext + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 movq m0, buf0 movq m3, buf1 TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index 3b81ef6fcf..fdaf510bd6 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -7,20 +7,20 @@ ;* Loren Merritt <lorenm@u.washington.edu> ;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -417,9 +417,11 @@ cglobal deblock_h_luma_10, 5,7,15 INIT_XMM sse2 DEBLOCK_LUMA_64 +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA_64 %endif +%endif %macro SWAPMOVA 2 %ifid %1 @@ -712,8 +714,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16 INIT_XMM sse2 DEBLOCK_LUMA_INTRA_64 +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA_INTRA_64 +%endif %endif @@ -797,10 +801,12 @@ DEBLOCK_LUMA_INTRA INIT_XMM sse2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_LUMA DEBLOCK_LUMA_INTRA %endif +%endif ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp ; out: %1=p0', %2=q0' @@ -911,5 +917,7 @@ DEBLOCK_CHROMA %endif INIT_XMM sse2 DEBLOCK_CHROMA +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEBLOCK_CHROMA +%endif diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h index bb881c35df..0dc0a7cb0f 100644 --- a/libavcodec/x86/h264_i386.h +++ b/libavcodec/x86/h264_i386.h @@ -2,20 +2,20 @@ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 2771291a8e..9af98a9a0c 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -9,20 +9,20 @@ ;* Holger Lubitz <hal@duncan.ol.sub.de> ;* Min Chen <chenm001.163.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;***************************************************************************** diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 4e51d2b5d0..df212888dc 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -83,8 +83,10 @@ cglobal h264_idct_add_10, 3,3 INIT_XMM sse2 IDCT_ADD_10 +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD_10 +%endif ;----------------------------------------------------------------------------- ; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) @@ -115,9 +117,11 @@ add4x4_idct %+ SUFFIX: INIT_XMM sse2 ALIGN 16 ADD4x4IDCT +%if HAVE_AVX_EXTERNAL INIT_XMM avx ALIGN 16 ADD4x4IDCT +%endif %macro ADD16_OP 2 cmp byte [r4+%2], 0 @@ -153,8 +157,10 @@ cglobal h264_idct_add16_10, 5,6 INIT_XMM sse2 IDCT_ADD16_10 +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD16_10 +%endif ;----------------------------------------------------------------------------- ; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) @@ -218,8 +224,10 @@ cglobal h264_idct8_dc_add_10,3,4,7 INIT_XMM sse2 IDCT8_DC_ADD +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_DC_ADD +%endif ;----------------------------------------------------------------------------- ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) @@ -289,8 +297,10 @@ cglobal h264_idct_add16intra_10,5,7,8 INIT_XMM sse2 IDCT_ADD16INTRA_10 +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD16INTRA_10 +%endif %assign last_block 36 ;----------------------------------------------------------------------------- @@ -324,8 +334,10 @@ cglobal h264_idct_add8_10,5,8,7 INIT_XMM sse2 IDCT_ADD8 +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD8 +%endif ;----------------------------------------------------------------------------- ; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) @@ -531,8 +543,10 @@ h264_idct8_add1_10 %+ SUFFIX: INIT_XMM sse2 IDCT8_ADD +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_ADD +%endif ;----------------------------------------------------------------------------- ; 
h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) @@ -569,5 +583,7 @@ cglobal h264_idct8_add4_10, 0,7,16 INIT_XMM sse2 IDCT8_ADD4 +%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_ADD4 +%endif diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index b9db3f450f..3064ec5201 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -5,20 +5,20 @@ ;* Copyright (c) 2010 Loren Merritt ;* Copyright (c) 2010 Ronald S. Bultje ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -2486,10 +2486,7 @@ cglobal pred4x4_tm_vp8_8, 3,3 pshufb mm3, mm6 pshufb mm4, mm6 pshufb mm5, mm6 - psubw mm2, mm7 - psubw mm3, mm7 - psubw mm4, mm7 - psubw mm5, mm7 + psubw mm0, mm7 paddw mm2, mm0 paddw mm3, mm0 paddw mm4, mm0 diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 1b7974b790..54eaee53c5 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -82,8 +82,10 @@ INIT_XMM sse2 PRED4x4_DR INIT_XMM ssse3 PRED4x4_DR +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_DR +%endif ;----------------------------------------------------------------------------- ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride) @@ -119,8 +121,10 @@ INIT_XMM sse2 PRED4x4_VR INIT_XMM ssse3 PRED4x4_VR +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_VR +%endif ;----------------------------------------------------------------------------- ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride) @@ -159,8 +163,10 @@ INIT_XMM sse2 PRED4x4_HD INIT_XMM ssse3 PRED4x4_HD +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_HD +%endif ;----------------------------------------------------------------------------- ; void pred4x4_dc(pixel *src, const pixel *topright, int stride) @@ -228,8 +234,10 @@ cglobal pred4x4_down_left_10, 3, 3 INIT_XMM sse2 PRED4x4_DL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_DL +%endif ;----------------------------------------------------------------------------- ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride) @@ -255,8 +263,10 @@ cglobal pred4x4_vertical_left_10, 3, 3 INIT_XMM sse2 PRED4x4_VL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED4x4_VL +%endif ;----------------------------------------------------------------------------- ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride) @@ -563,8 +573,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6 INIT_XMM sse2 PRED8x8L_TOP_DC +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_TOP_DC +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) @@ -620,8 +632,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6 INIT_XMM sse2 PRED8x8L_DC +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DC +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) @@ -653,8 +667,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6 INIT_XMM sse2 PRED8x8L_VERTICAL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_VERTICAL +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) @@ -707,8 +723,10 @@ INIT_XMM sse2 PRED8x8L_HORIZONTAL INIT_XMM ssse3 PRED8x8L_HORIZONTAL +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_HORIZONTAL +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) @@ -773,8 +791,10 @@ INIT_XMM sse2 PRED8x8L_DOWN_LEFT INIT_XMM ssse3 PRED8x8L_DOWN_LEFT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DOWN_LEFT +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride) @@ -845,8 +865,10 @@ INIT_XMM sse2 PRED8x8L_DOWN_RIGHT INIT_XMM ssse3 PRED8x8L_DOWN_RIGHT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_DOWN_RIGHT +%endif 
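
For orientation amid these mechanical HAVE_AVX_EXTERNAL guards: the one behavioral rewrite in this area is the pred4x4_tm_vp8_8 hunk in h264_intrapred.asm above, which replaces four per-row psubw instructions with a single psubw mm0, mm7, apparently folding the top-left subtraction into the top-row vector once per block. A minimal scalar sketch of 8-bit TM ("TrueMotion") prediction as commonly defined, with illustrative names (clip_uint8 and pred4x4_tm_c are not taken from this diff):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* dst[y][x] = clip(left + top - topleft); left/top/topleft are the
 * already-decoded neighbors of the 4x4 block at src. */
static void pred4x4_tm_c(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top = src - stride;
    const int topleft  = src[-stride - 1];
    for (int y = 0; y < 4; y++) {
        const int left = src[y * stride - 1];
        for (int x = 0; x < 4; x++)
            /* left + (top[x] - topleft): the asm now computes
             * (top - topleft) once per block instead of per row */
            src[y * stride + x] = clip_uint8(left + top[x] - topleft);
    }
}

The arithmetic is identical either way; the rewrite just hoists the common subtraction out of the row loop.
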
;----------------------------------------------------------------------------- ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) @@ -913,8 +935,10 @@ INIT_XMM sse2 PRED8x8L_VERTICAL_RIGHT INIT_XMM ssse3 PRED8x8L_VERTICAL_RIGHT +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_VERTICAL_RIGHT +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) @@ -972,8 +996,10 @@ INIT_XMM sse2 PRED8x8L_HORIZONTAL_UP INIT_XMM ssse3 PRED8x8L_HORIZONTAL_UP +%if HAVE_AVX_EXTERNAL INIT_XMM avx PRED8x8L_HORIZONTAL_UP +%endif ;----------------------------------------------------------------------------- diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index f934256706..1724153932 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Jason Garrett-Glaser * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 90857cec88..fd6068ffb1 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -2,20 +2,20 @@ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * Copyright (c) 2011 Daniel Kang * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -338,7 +338,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ }\ @@ -348,7 +348,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ }\ @@ -358,7 +358,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ }\ @@ -368,7 +368,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t * DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ uint8_t * const halfHV= temp;\ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ + av_assert2(((int)temp & 7) == 0);\ ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ }\ diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index e14df84431..456187140c 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index bc6c72541b..2d287ba443 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -6,20 +6,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 646acdffe3..4759a063a6 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -135,6 +135,13 @@ WEIGHT_FUNC_HALF_MM 8, 8 add off_regd, 1 or off_regd, 1 add r4, 1 + cmp r5, 128 + jne .normal + sar r5, 1 + sar r6, 1 + sar off_regd, 1 + sub r4, 1 +.normal %if cpuflag(ssse3) movd m4, r5d movd m0, r6d diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm index 3b09e420d2..b7845fd74a 100644 --- a/libavcodec/x86/h264_weight_10bit.asm +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -5,20 +5,20 @@ ;* ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c index eec1653d3f..3d8d5b0fe1 100644 --- a/libavcodec/x86/h264chroma_init.c +++ b/libavcodec/x86/h264chroma_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index e9d93e0af9..ae449e7e57 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -211,6 +211,7 @@ H264_BIWEIGHT_10_SSE(4, 10) av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { +#if HAVE_YASM int cpu_flags = av_get_cpu_flags(); if (chroma_format_idc <= 1 && EXTERNAL_MMXEXT(cpu_flags)) @@ -366,4 +367,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, #endif /* HAVE_ALIGNED_STACK */ } } +#endif } diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index ff6e57a5d1..4eaba6eaca 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -1,20 +1,27 @@ ;****************************************************************************** +;* +;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org> +;* Copyright (c) Nick Kurshev <nickols_k@mail.ru> +;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz> +;* Copyright (c) 2013 Daniel Kang +;* ;* MMX optimized hpel functions ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index c4f555c367..385f01b4bd 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -3,20 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c index fece265da8..50db36dc1b 100644 --- a/libavcodec/x86/hpeldsp_mmx.c +++ b/libavcodec/x86/hpeldsp_mmx.c @@ -3,20 +3,20 @@ * * Copyright (c) 2001 Fabrice Bellard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c index 516a03aec2..94e06d820a 100644 --- a/libavcodec/x86/hpeldsp_rnd_template.c +++ b/libavcodec/x86/hpeldsp_rnd_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c index 27723393bf..4cd6de101c 100644 --- a/libavcodec/x86/idct_mmx_xvid.c +++ b/libavcodec/x86/idct_mmx_xvid.c @@ -22,20 +22,20 @@ * * conversion to gcc syntax by Michael Niedermayer * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, + * along with FFmpeg; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c index 50655d6bc0..af4790ca92 100644 --- a/libavcodec/x86/idct_sse2_xvid.c +++ b/libavcodec/x86/idct_sse2_xvid.c @@ -9,7 +9,7 @@ * * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. * - * This file is part of Libav. + * This file is part of FFmpeg. * * Vertical pass is an implementation of the scheme: * Loeffler C., Ligtenberg A., and Moschytz C.S.: @@ -23,22 +23,21 @@ * * More details at http://skal.planet-d.net/coding/dct.html * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, + * along with FFmpeg; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "idct_xvid.h" diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h index aea28bab96..7a2847b864 100644 --- a/libavcodec/x86/idct_xvid.h +++ b/libavcodec/x86/idct_xvid.h @@ -1,20 +1,20 @@ /* * XVID MPEG-4 VIDEO CODEC * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm index 633fcd9d59..d311fbe1a0 100644 --- a/libavcodec/x86/imdct36.asm +++ b/libavcodec/x86/imdct36.asm @@ -2,20 +2,20 @@ ;* 36 point SSE-optimized IMDCT transform ;* Copyright (c) 2011 Vitor Sessak ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -370,8 +370,10 @@ DEFINE_IMDCT INIT_XMM ssse3 DEFINE_IMDCT +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEFINE_IMDCT +%endif INIT_XMM sse @@ -716,5 +718,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp INIT_XMM sse DEFINE_FOUR_IMDCT +%if HAVE_AVX_EXTERNAL INIT_XMM avx DEFINE_FOUR_IMDCT +%endif diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c index 12245c4298..8a74755d1b 100644 --- a/libavcodec/x86/lpc.c +++ b/libavcodec/x86/lpc.c @@ -2,26 +2,25 @@ * MMX optimized LPC DSP utils * Copyright (c) 2007 Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -152,7 +151,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c) #if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; } diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h index a62094ee97..9c48afeb20 100644 --- a/libavcodec/x86/mathops.h +++ b/libavcodec/x86/mathops.h @@ -2,20 +2,20 @@ * simple math operations * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp.c index 72fc637764..94849b7e79 100644 --- a/libavcodec/x86/mlpdsp.c +++ b/libavcodec/x86/mlpdsp.c @@ -2,25 +2,24 @@ * MLP DSP functions x86-optimized * Copyright (c) 2009 Ramiro Polla * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" -#include "libavutil/internal.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" diff --git a/libavcodec/x86/motion_est.c b/libavcodec/x86/motion_est.c index 41b9c5c781..5f5d93e9bc 100644 --- a/libavcodec/x86/motion_est.c +++ b/libavcodec/x86/motion_est.c @@ -5,25 +5,25 @@ * * mostly by Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" -#include "libavutil/internal.h" +#include "libavutil/avassert.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -41,7 +41,7 @@ DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - x86_reg len= -(stride*h); + x86_reg len= -(x86_reg)stride*h; __asm__ volatile( ".p2align 4 \n\t" "1: \n\t" @@ -204,7 +204,7 @@ static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { - x86_reg len= -(stride*h); + x86_reg len= -(x86_reg)stride*h; __asm__ volatile( ".p2align 4 \n\t" "1: \n\t" @@ -242,7 +242,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - x86_reg len= -(stride*h); + x86_reg len= -(x86_reg)stride*h; __asm__ volatile( "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t" @@ -332,7 +332,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) #define PIX_SAD(suf)\ static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ - assert(h==8);\ + av_assert2(h==8);\ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ \ @@ -342,7 +342,7 @@ static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h }\ static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ - assert(h==8);\ + av_assert2(h==8);\ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "movq %0, %%mm5 \n\t"\ @@ -356,7 +356,7 @@ static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, in \ static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int 
stride, int h)\ {\ - assert(h==8);\ + av_assert2(h==8);\ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "movq %0, %%mm5 \n\t"\ @@ -370,7 +370,7 @@ static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, in \ static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ - assert(h==8);\ + av_assert2(h==8);\ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ ::);\ @@ -467,7 +467,7 @@ av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) c->pix_abs[1][3] = sad8_xy2_mmxext; } } - if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) { + if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { c->sad[0]= sad16_sse2; } #endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm index df20ea9dc6..ca52375a76 100644 --- a/libavcodec/x86/mpeg4qpel.asm +++ b/libavcodec/x86/mpeg4qpel.asm @@ -1,21 +1,23 @@ ;****************************************************************************** ;* mpeg4 qpel +;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> ;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c index f45cb7e4a4..2ec4c5dc6c 100644 --- a/libavcodec/x86/mpegaudiodsp.c +++ b/libavcodec/x86/mpegaudiodsp.c @@ -2,20 +2,20 @@ * MMX optimized MP3 decoding functions * Copyright (c) 2010 Vitor Sessak * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,11 +26,16 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/mpegaudiodsp.h" -void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); +#define DECL(CPU)\ +static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ +void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); + +DECL(sse) +DECL(sse2) +DECL(sse3) +DECL(ssse3) +DECL(avx) + void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, float *tmpbuf); void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, @@ -217,11 +222,15 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ } \ } +#if HAVE_SSE DECL_IMDCT_BLOCKS(sse,sse) DECL_IMDCT_BLOCKS(sse2,sse) DECL_IMDCT_BLOCKS(sse3,sse) DECL_IMDCT_BLOCKS(ssse3,sse) +#endif +#if HAVE_AVX_EXTERNAL DECL_IMDCT_BLOCKS(avx,avx) +#endif #endif /* HAVE_YASM */ av_cold void ff_mpadsp_init_x86(MPADSPContext *s) diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c index 25b44e6768..b2ce68062c 100644 --- a/libavcodec/x86/mpegvideo.c +++ b/libavcodec/x86/mpegvideo.c @@ -2,20 +2,20 @@ * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -27,7 +27,7 @@ #include "libavcodec/mpegvideo.h" #include "dsputil_x86.h" -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, int16_t *block, int n, int qscale) @@ -36,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, qmul = qscale << 1; - assert(s->block_last_index[n]>=0 || s->h263_aic); + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); if (!s->h263_aic) { if (n < 4) @@ -112,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, qmul = qscale << 1; qadd = (qscale - 1) | 1; - assert(s->block_last_index[n]>=0 || s->h263_aic); + av_assert2(s->block_last_index[n]>=0 || s->h263_aic); nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; @@ -172,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, const uint16_t *quant_matrix; int block0; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; @@ -240,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, x86_reg nCoeffs; const uint16_t *quant_matrix; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; @@ -307,7 +307,7 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, const uint16_t *quant_matrix; int block0; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; @@ -372,7 +372,7 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, x86_reg nCoeffs; const uint16_t *quant_matrix; - assert(s->block_last_index[n]>=0); + av_assert2(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; @@ -443,11 +443,11 @@ __asm__ volatile( ); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ av_cold void ff_MPV_common_init_x86(MpegEncContext *s) { -#if HAVE_INLINE_ASM +#if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { @@ -459,5 +459,5 @@ av_cold void ff_MPV_common_init_x86(MpegEncContext *s) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_MMX_INLINE */ } diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c index 19ab83a7fa..7dd9959087 100644 --- a/libavcodec/x86/mpegvideoenc.c +++ b/libavcodec/x86/mpegvideoenc.c @@ -2,20 +2,20 @@ * The simplest mpeg encoder (well, it was the simplest!) * Copyright (c) 2000,2001 Fabrice Bellard * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
* - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -194,7 +194,7 @@ static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ } #endif /* HAVE_INLINE_ASM */ -av_cold void ff_MPV_encode_init_x86(MpegEncContext *s) +av_cold void ff_dct_encode_init_x86(MpegEncContext *s) { const int dct_algo = s->avctx->dct_algo; int i; diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index a8d2a2cf8a..0defc404fe 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -3,20 +3,20 @@ * * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -100,7 +100,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, const uint16_t *qmat, *bias; LOCAL_ALIGNED_16(int16_t, temp_block, [64]); - assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? + av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? //s->fdct (block); RENAMEl(ff_fdct) (block); //cannot be anything else ... 
@@ -110,10 +110,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if (s->mb_intra) { int dummy; - if (n < 4) + if (n < 4){ q = s->y_dc_scale; - else + bias = s->q_intra_matrix16[qscale][1]; + qmat = s->q_intra_matrix16[qscale][0]; + }else{ q = s->c_dc_scale; + bias = s->q_chroma_intra_matrix16[qscale][1]; + qmat = s->q_chroma_intra_matrix16[qscale][0]; + } /* note: block[0] is assumed to be positive */ if (!s->h263_aic) { __asm__ volatile ( @@ -128,8 +133,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; - bias = s->q_intra_matrix16[qscale][1]; - qmat = s->q_intra_matrix16[qscale][0]; } else { last_non_zero_p1 = 0; bias = s->q_inter_matrix16[qscale][1]; diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm index c05f3da017..8e23ccfbc6 100644 --- a/libavcodec/x86/pngdsp.asm +++ b/libavcodec/x86/pngdsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c index 34a3da36d7..7dca62c675 100644 --- a/libavcodec/x86/pngdsp_init.c +++ b/libavcodec/x86/pngdsp_init.c @@ -2,20 +2,20 @@ * x86 PNG optimizations. * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details.
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index 855f2094b7..aedacc2cdc 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -1,23 +1,24 @@ ;****************************************************************************** ;* x86-SIMD-optimized IDCT for prores -;* this is identical to "simple" IDCT except for the clip range +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range ;* ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -47,10 +48,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2 w5_min_w1: times 4 dw W5sh2, -W1sh2 w5_plus_w7: times 4 dw W5sh2, +W7sh2 w7_min_w5: times 4 dw W7sh2, -W5sh2 -row_round: times 8 dw (1<<14) +pw_88: times 8 dw 0x2008 +cextern pw_1 cextern pw_4 -cextern pw_8 cextern pw_512 cextern pw_1019 @@ -91,14 +92,12 @@ section .text align=16 ; a2 -= W6 * row[2]; ; a3 -= W2 * row[2]; %ifidn %1, col - paddw m10,[pw_8] + paddw m10,[pw_88] %endif - SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] %ifidn %1, row - psubw m10,[row_round] + paddw m10,[pw_1] %endif - SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7] - SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7] + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] pmaddwd m2, m0, [w4_plus_w6] pmaddwd m3, m1, [w4_plus_w6] pmaddwd m4, m0, [w4_min_w6] @@ -107,75 +106,33 @@ section .text align=16 pmaddwd m7, m1, [w4_min_w2] pmaddwd m0, [w4_plus_w2] pmaddwd m1, [w4_plus_w2] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; a0: -1*row[0]-1*row[2] ; a1: -1*row[0] ; a2: -1*row[0] ; a3: -1*row[0]+1*row[2] - psubd m2, m10 ; a1[0-3] - psubd m3, m11 ; a1[4-7] - psubd m4, m10 ; a2[0-3] - psubd m5, m11 ; a2[4-7] - psubd m0, m10 - psubd m1, m11 - psubd m6, m10 - psubd m7, m11 - psubd m0, m8 ; a0[0-3] - psubd m1, m9 ; a0[4-7] - paddd m6, m8 ; a3[0-3] - paddd m7, m9 ; a3[4-7] ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] ; a3 += W4*row[4] - W6*row[6]; i.e.
-1*row[4] SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] - SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7] pmaddwd m10, m8, [w4_plus_w6] pmaddwd m11, m9, [w4_plus_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m0, m10 ; a0[0-3] paddd m1, m11 ; a0[4-7] pmaddwd m10, m8, [w4_min_w6] pmaddwd m11, m9, [w4_min_w6] - pslld m10, 2 - pslld m11, 2 - psubd m10, m13 - psubd m11, m14 paddd m6, m10 ; a3[0-3] paddd m7, m11 ; a3[4-7] pmaddwd m10, m8, [w4_min_w2] pmaddwd m11, m9, [w4_min_w2] pmaddwd m8, [w4_plus_w2] pmaddwd m9, [w4_plus_w2] - pslld m10, 2 - pslld m11, 2 - pslld m8, 2 - pslld m9, 2 - psubd m10, m13 - psubd m11, m14 - psubd m8, m13 - psubd m9, m14 psubd m4, m10 ; a2[0-3] intermediate psubd m5, m11 ; a2[4-7] intermediate psubd m2, m8 ; a1[0-3] intermediate psubd m3, m9 ; a1[4-7] intermediate - SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7] - psubd m4, m12 ; a2[0-3] - psubd m5, m13 ; a2[4-7] - paddd m2, m12 ; a1[0-3] - paddd m3, m13 ; a1[4-7] ; load/store mova [r2+ 0], m0 @@ -206,8 +163,6 @@ section .text align=16 ; b3 = MUL(W7, row[1]); ; MAC(b3, -W5, row[3]); SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] - SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7] - SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7] pmaddwd m2, m0, [w3_min_w7] pmaddwd m3, m1, [w3_min_w7] pmaddwd m4, m0, [w5_min_w1] @@ -216,35 +171,11 @@ section .text align=16 pmaddwd m7, m1, [w7_min_w5] pmaddwd m0, [w1_plus_w3] pmaddwd m1, [w1_plus_w3] - pslld m2, 2 - pslld m3, 2 - pslld m4, 2 - pslld m5, 2 - pslld m6, 2 - pslld m7, 2 - pslld m0, 2 - pslld m1, 2 ; b0: +1*row[1]+2*row[3] ; b1: +2*row[1]-1*row[3] ; b2: -1*row[1]-1*row[3] ; b3: +1*row[1]+1*row[3] - psubd m2, m8 - psubd m3, m9 - paddd m0, m8 - paddd m1, m9 - paddd m8, m10 ; { row[1] + row[3] }[0-3] - paddd m9, m11 ; { row[1] + row[3] }[4-7] - paddd m10, m10 - paddd m11, m11 - paddd m0, m8 ; b0[0-3] - paddd m1, m9 ; b0[4-7] - paddd m2, m10 ; b1[0-3] - paddd m3, m11 ; b2[4-7] - psubd m4, m8 ; b2[0-3] - psubd m5, m9 ; b2[4-7] - paddd m6, m8 ; b3[0-3] - paddd m7, m9 ; b3[4-7] ; MAC(b0, W5, row[5]); ; MAC(b0, W7, row[7]); @@ -255,38 +186,16 @@ section .text align=16 ; MAC(b3, W3, row[5]); ; MAC(b3, -W1, row[7]); SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] - SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7] - SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7] ; b0: -1*row[5]+1*row[7] ; b1: -1*row[5]+1*row[7] ; b2: +1*row[5]+2*row[7] ; b3: +2*row[5]-1*row[7] - paddd m4, m13 - paddd m5, m12 - paddd m6, m13 - paddd m7, m12 - psubd m13, m14 ; { row[5] - row[7] }[0-3] - psubd m12, m11 ; { row[5] - row[7] }[4-7] - paddd m14, m14 - paddd m11, m11 - psubd m0, m13 - psubd m1, m12 - psubd m2, m13 - psubd m3, m12 - paddd m4, m14 - paddd m5, m11 - paddd m6, m13 - paddd m7, m12 pmaddwd m10, m8, [w1_plus_w5] pmaddwd m11, m9, [w1_plus_w5] pmaddwd m12, m8, [w5_plus_w7] pmaddwd m13, m9, [w5_plus_w7] - pslld m10, 2 - pslld m11, 2 - pslld m12, 2 - pslld m13, 2 psubd m2, m10 ; b1[0-3] psubd m3, m11 ; b1[4-7] paddd m0, m12 ; b0[0-3] @@ -295,10 +204,6 @@ section .text align=16 pmaddwd m13, m9, [w7_plus_w3] pmaddwd m8, [w3_min_w1] pmaddwd m9, [w3_min_w1] - pslld m12, 2 - pslld m13, 2 - pslld m8, 2 - pslld m9, 2 paddd m4, m12 ; b2[0-3] paddd m5, m13 ; b2[4-7] paddd m6, m8 ; b3[0-3] @@ -345,7 +250,7 @@ cglobal prores_idct_put_10, 4, 4, %1 pmullw m13,[r3+64] pmullw m12,[r3+96] - IDCT_1D row, 17 + IDCT_1D row, 15 ; transpose for second part of IDCT TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 @@ -360,20 +265,11 @@ cglobal 
prores_idct_put_10, 4, 4, %1 ; for (i = 0; i < 8; i++) ; idctSparseColAdd(dest + i, line_size, block + i); - IDCT_1D col, 20 + IDCT_1D col, 18 ; clip/store - mova m6, [pw_512] mova m3, [pw_4] mova m5, [pw_1019] - paddw m8, m6 - paddw m0, m6 - paddw m1, m6 - paddw m2, m6 - paddw m4, m6 - paddw m11, m6 - paddw m9, m6 - paddw m10, m6 pmaxsw m8, m3 pmaxsw m0, m3 pmaxsw m1, m3 @@ -422,7 +318,9 @@ INIT_XMM sse2 idct_put_fn 16 INIT_XMM sse4 idct_put_fn 16 +%if HAVE_AVX_EXTERNAL INIT_XMM avx idct_put_fn 16 +%endif %endif diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c index d63382c554..fa4a2d48e3 100644 --- a/libavcodec/x86/proresdsp_init.c +++ b/libavcodec/x86/proresdsp_init.c @@ -3,20 +3,20 @@ * * Copyright (c) 2010-2011 Maxim Poliakovski * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -32,7 +32,7 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, int16_t *block, const int16_t *qmat); -av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp) +av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) { #if ARCH_X86_64 int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index c90b3932af..c2ffb86717 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2003-2013 Michael Niedermayer ;* Copyright (c) 2013 Daniel Kang ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/rnd_mmx.c b/libavcodec/x86/rnd_mmx.c index db4515a9c5..326e2f395b 100644 --- a/libavcodec/x86/rnd_mmx.c +++ b/libavcodec/x86/rnd_mmx.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c index e9a5a45799..e37fc19506 100644 --- a/libavcodec/x86/rnd_template.c +++ b/libavcodec/x86/rnd_template.c @@ -7,20 +7,20 @@ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> * and improved by Zdenek Kabelac <kabi@users.sf.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 4d9c35b600..7732d65b2a 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -2,20 +2,20 @@ ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c index 586e4e9a6d..99c56f9d09 100644 --- a/libavcodec/x86/rv34dsp_init.c +++ b/libavcodec/x86/rv34dsp_init.c @@ -2,20 +2,20 @@ * RV30/40 MMX/SSE2 optimizations * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm index d12b079eb7..792a54f572 100644 --- a/libavcodec/x86/rv40dsp.asm +++ b/libavcodec/x86/rv40dsp.asm @@ -4,20 +4,20 @@ ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c index 781f467490..75ba8ba12c 100644 --- a/libavcodec/x86/rv40dsp_init.c +++ b/libavcodec/x86/rv40dsp_init.c @@ -2,20 +2,20 @@ * RV40 decoder motion compensation functions x86-optimised * Copyright (c) 2008 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm index 36a0918473..adc13c4353 100644 --- a/libavcodec/x86/sbrdsp.asm +++ b/libavcodec/x86/sbrdsp.asm @@ -2,20 +2,20 @@ ;* AAC Spectral Band Replication decoding functions ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -26,6 +26,12 @@ SECTION_RODATA ps_mask times 2 dd 1<<31, 0 ps_mask2 times 2 dd 0, 1<<31 ps_neg times 4 dd 1<<31 +ps_noise0 times 2 dd 1.0, 0.0, +ps_noise2 times 2 dd -1.0, 0.0 +ps_noise13 dd 0.0, 1.0, 0.0, -1.0 + dd 0.0, -1.0, 0.0, 1.0 + dd 0.0, 1.0, 0.0, -1.0 +cextern sbr_noise_table SECTION_TEXT @@ -136,7 +142,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1) mova m3, m1 mova m4, m2 - mova m7, [ps_mask] ; Set pointers %if ARCH_X86_64 == 0 || WIN64 @@ -156,30 +161,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E shl start, 3 ; offset from num loops mova m0, [X_lowq + start] - movlhps m1, m1 ; (a2 a3 a2 a3) - movlhps m2, m2 ; (a0 a1 a0 a1) - shufps m3, m3, q0101 ; (a3 a2 a3 a2) - shufps m4, m4, q0101 ; (a1 a0 a1 a0) - xorps m3, m7 ; (-a3 a2 -a3 a2) - xorps m4, m7 ; (-a1 a0 -a1 a0) + shufps m3, m3, q1111 + shufps m4, m4, q1111 + xorps m3, [ps_mask] + shufps m1, m1, q0000 + shufps m2, m2, q0000 + xorps m4, [ps_mask] .loop2: - mova m5, m0 + movu m7, [X_lowq + start + 8] ; BbCc mova m6, m0 - shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"} - shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"} - mulps m0, m2 - mulps m5, m4 - mova m7, m6 - addps m5, m0 - mova m0, [X_lowq + start + 2*2*4] - shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"} - shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"} - mulps m6, m1 + mova m5, m7 + shufps m0, m0, q2301 ; aAbB + shufps m7, m7, q2301 ; bBcC + mulps m0, m4 mulps m7, m3 - addps m5, m6 + mulps m6, m2 + mulps m5, m1 + addps m7, m0 + mova m0, [X_lowq + start +16] ; CcDd addps m7, m0 - addps m5, m7 - mova [X_highq + start], m5 + addps m6, m5 + addps m7, m6 + mova [X_highq + start], m7 add start, 16 jnz .loop2 RET @@ -246,33 +249,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z jne .loop REP_RET -INIT_XMM sse2 ; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1) +%macro SBR_QMF_DEINT_BFLY 0 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c mov cq, 64*4-2*mmsize lea vrevq, [vq + 64*4] .loop: mova m0, [src0q+cq] mova m1, [src1q] - mova m2, [src0q+cq+mmsize] - mova m3, [src1q+mmsize] - pshufd m4, m0, q0123 - pshufd m5, m1, q0123 - pshufd m6, m2, q0123 - pshufd m7, m3, q0123 - addps m3, m4 + mova m4, [src0q+cq+mmsize] + mova m5, [src1q+mmsize] +%if cpuflag(sse2) + pshufd m2, m0, q0123 + pshufd m3, m1, q0123 + pshufd m6, m4, q0123 + pshufd m7, m5, q0123 +%else + shufps m2, m0, m0, q0123 + shufps m3, m1, m1, q0123 + shufps m6, m4, m4, q0123 + shufps m7, m5, m5, q0123 +%endif + addps m5, m2 subps m0, m7 addps m1, m6 - subps m2, m5 + subps m4, m3 mova [vrevq], m1 - mova [vrevq+mmsize], m3 + mova [vrevq+mmsize], m5 mova [vq+cq], m0 - mova [vq+cq+mmsize], m2 + mova [vq+cq+mmsize], m4 add src1q, 2*mmsize add vrevq, 2*mmsize sub cq, 2*mmsize jge .loop REP_RET +%endmacro + +INIT_XMM sse +SBR_QMF_DEINT_BFLY + +INIT_XMM sse2 +SBR_QMF_DEINT_BFLY INIT_XMM sse2 cglobal sbr_qmf_pre_shuffle, 1,4,6,z @@ -303,3 +320,106 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z movq m2, [zq] movq [r2q], m2 REP_RET + +%ifdef PIC +%define NREGS 1 +%if UNIX64 +%define NOISE_TABLE r6q ; r5q is m_max +%else +%define NOISE_TABLE r5q +%endif +%else +%define NREGS 
0 +%define NOISE_TABLE sbr_noise_table +%endif + +%macro LOAD_NST 1 +%ifdef PIC + lea NOISE_TABLE, [%1] + mova m0, [kxq + NOISE_TABLE] +%else + mova m0, [kxq + %1] +%endif +%endmacro + +INIT_XMM sse2 +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise0] + jmp apply_noise_main + +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13 + jmp apply_noise_main + +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + mova m0, [ps_noise2] + jmp apply_noise_main + +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m, +; const float *q_filt, int noise, +; int kx, int m_max) +cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max + and kxq, 1 + shl kxq, 4 + LOAD_NST ps_noise13+16 + +apply_noise_main: +%if ARCH_X86_64 == 0 || WIN64 + mov kxd, m_maxm +%define count kxq +%else +%define count m_maxq +%endif + dec noiseq + shl count, 2 +%ifdef PIC + lea NOISE_TABLE, [sbr_noise_table] +%endif + lea Yq, [Yq + 2*count] + add s_mq, count + add q_filtq, count + shl noiseq, 3 + pxor m5, m5 + neg count +.loop: + mova m1, [q_filtq + count] + movu m3, [noiseq + NOISE_TABLE + 1*mmsize] + movu m4, [noiseq + NOISE_TABLE + 2*mmsize] + add noiseq, 2*mmsize + and noiseq, 0x1ff<<3 + punpckhdq m2, m1, m1 + punpckldq m1, m1 + mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] + mova m3, [s_mq + count] + ; TODO: replace by a vpermd in AVX2 + punpckhdq m4, m3, m3 + punpckldq m3, m3 + pcmpeqd m6, m3, m5 ; m6 == 0 + pcmpeqd m7, m4, m5 ; m7 == 0 + mulps m3, m0 ; s_m[m] * phi_sign + mulps m4, m0 ; s_m[m] * phi_sign + pand m1, m6 + pand m2, m7 + movu m6, [Yq + 2*count] + movu m7, [Yq + 2*count + mmsize] + addps m3, m1 + addps m4, m2 + addps m6, m3 + addps m7, m4 + movu [Yq + 2*count], m6 + movu [Yq + 2*count + mmsize], m7 + add count, mmsize + jl .loop + RET diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c index 9600852163..2b912d0e9e 100644 --- a/libavcodec/x86/sbrdsp_init.c +++ b/libavcodec/x86/sbrdsp_init.c @@ -2,20 +2,20 @@ * AAC Spectral Band Replication decoding functions * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
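The four sbr_hf_apply_noise_*_sse2 kernels above vectorize SBR's noise-floor addition, two complex output samples per iteration. For orientation, a scalar sketch of the same operation, modeled on the C fallback in libavcodec/sbrdsp.c; the phi_sign parameters stand in for the ps_noise* phase constants, and names and signatures here are illustrative rather than the exact FFmpeg API:

    extern const float ff_sbr_noise_table[512][2];  /* the cextern'd table above */

    /* Scalar sketch of sbr_hf_apply_noise_*: where no sinusoid is
     * signalled (s_m[m] == 0), add pseudo-random noise scaled by
     * q_filt; otherwise add the sinusoid with the selected phase.
     * The SIMD code computes both terms and selects with pcmpeqd
     * masks instead of branching. */
    static void sbr_hf_apply_noise_sketch(float (*Y)[2], const float *s_m,
                                          const float *q_filt, int noise,
                                          float phi_sign0, float phi_sign1,
                                          int m_max)
    {
        for (int m = 0; m < m_max; m++) {
            noise = (noise + 1) & 0x1ff;  /* same wrap as "and noiseq, 0x1ff<<3" */
            if (s_m[m]) {
                Y[m][0] += s_m[m] * phi_sign0;
                Y[m][1] += s_m[m] * phi_sign1;
            } else {
                Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
                Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
            }
        }
    }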
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -34,9 +34,23 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], float bw, int start, int end); void ff_sbr_neg_odd_64_sse(float *z); void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z); +void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1); void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1); void ff_sbr_qmf_pre_shuffle_sse2(float *z); +void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); + av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -48,10 +62,15 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) s->hf_g_filt = ff_sbr_hf_g_filt_sse; s->hf_gen = ff_sbr_hf_gen_sse; s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse; } if (EXTERNAL_SSE2(cpu_flags)) { s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2; s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2; + s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2; } } diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c index 36f0b477e0..c666b1a6df 100644 --- a/libavcodec/x86/simple_idct.c +++ b/libavcodec/x86/simple_idct.c @@ -3,24 +3,23 @@ * * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
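ff_sbrdsp_init_x86 above shows the standard libavcodec dispatch idiom: probe the CPU flags once, then let each stricter check overwrite the function pointers set by weaker ones, so the best available kernel wins. A minimal sketch of the idiom with placeholder names (ExampleDSPContext and ff_foo_* are not real FFmpeg symbols):

    #include "libavutil/cpu.h"
    #include "libavutil/x86/cpu.h"

    typedef struct ExampleDSPContext {
        void (*foo)(float *dst, const float *src, int n);
    } ExampleDSPContext;

    void ff_foo_sse (float *dst, const float *src, int n);
    void ff_foo_sse2(float *dst, const float *src, int n);

    /* Later checks override earlier assignments; the EXTERNAL_*
     * macros also verify that yasm support was compiled in. */
    void example_dsp_init_x86(ExampleDSPContext *s)
    {
        int cpu_flags = av_get_cpu_flags();

        if (EXTERNAL_SSE(cpu_flags))
            s->foo = ff_foo_sse;
        if (EXTERNAL_SSE2(cpu_flags))
            s->foo = ff_foo_sse2;
    }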
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavcodec/simple_idct.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "dsputil_x86.h" @@ -81,7 +80,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { static inline void idct(int16_t *block) { - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); int16_t * const temp= (int16_t*)align_tmp; __asm__ volatile( diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c new file mode 100644 index 0000000000..735e7905a0 --- /dev/null +++ b/libavcodec/x86/snowdsp.c @@ -0,0 +1,902 @@ +/* + * MMX and SSE2 optimized snow DSP utils + * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/snow.h" +#include "libavcodec/snow_dwt.h" +#include "dsputil_x86.h" + +#if HAVE_INLINE_ASM + +static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice + // (the first time erroneously), we allow the SSE2 code to run an extra pass. + // The savings in code and time are well worth having to store this value and + // calculate b[0] correctly afterwards. 
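The duplicated-b[0] trick in the comment above lets every lifting pass below run full 16-coefficient SSE2 iterations without a scalar prologue. For orientation, one lifting pass of the integer 9/7 DWT in scalar form, matching the snow_horizontal_compose_lift_lead_out tail handling; the W_* constants are defined in libavcodec/snow_dwt.h:

    /* One in-place lifting pass: band b is updated from neighbour band
     * ref using a multiply, rounding offset and shift (e.g. W_DM, W_DO,
     * W_DS for lift 0). The SSE2 loops below fold the multiply and
     * shift into a single pmulhw against a precomputed 16-bit constant. */
    static void lift_pass_sketch(IDWTELEM *b, const IDWTELEM *ref, int n,
                                 int mul, int add, int shift)
    {
        for (int i = 0; i < n; i++)
            b[i] -= (mul * (ref[i] + ref[i + 1]) + add) >> shift;
    }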
+ + i = 0; + __asm__ volatile( + "pcmpeqd %%xmm7, %%xmm7 \n\t" + "pcmpeqd %%xmm3, %%xmm3 \n\t" + "psllw $1, %%xmm3 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "psllw $13, %%xmm3 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm2 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + "pmulhw %%xmm3, %%xmm2 \n\t" + "pmulhw %%xmm3, %%xmm6 \n\t" + "paddw (%0), %%xmm2 \n\t" + "paddw 16(%0), %%xmm6 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm6, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ + dst[i] = dst[i] - (b[i] + b[i + 1]); + } + for(; i<w_r-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "movdqa (%0), %%xmm0 \n\t" + "movdqa 16(%0), %%xmm4 \n\t" + "psubw %%xmm2, %%xmm0 \n\t" + "psubw %%xmm6, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + IDWTELEM b_0 = b[0]; + + i = 0; + __asm__ volatile( + "psllw $15, %%xmm7 \n\t" + "pcmpeqw %%xmm6, %%xmm6 \n\t" + "psrlw $13, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm0 \n\t" + "movdqu 16(%1), %%xmm4 \n\t" + "movdqu 2(%1), %%xmm1 \n\t" + "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm5, %%xmm4 \n\t" + "psubw %%xmm7, %%xmm0 \n\t" + "psubw %%xmm7, %%xmm4 \n\t" + "psraw $1, %%xmm0 \n\t" + "psraw $1, %%xmm4 \n\t" + "movdqa (%0), %%xmm1 \n\t" + "movdqa 16(%0), %%xmm5 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "psraw $2, %%xmm0 \n\t" + "psraw $2, %%xmm4 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + + i = 0; + for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ + temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); + } + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw (%1), %%xmm2 \n\t" + "paddw 16(%1), %%xmm6 \n\t" + "movdqu (%0), %%xmm0 \n\t" + "movdqu 16(%0), %%xmm4 \n\t" + "paddw %%xmm2, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "psraw $1, %%xmm2 \n\t" + "psraw $1, %%xmm6 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "paddw %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm2, (%2) \n\t" + "movdqa %%xmm6, 16(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + 
snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x3E) != 0x3E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=62; i>=0; i-=64){ + __asm__ volatile( + "movdqa (%1), %%xmm0 \n\t" + "movdqa 16(%1), %%xmm2 \n\t" + "movdqa 32(%1), %%xmm4 \n\t" + "movdqa 48(%1), %%xmm6 \n\t" + "movdqa (%1), %%xmm1 \n\t" + "movdqa 16(%1), %%xmm3 \n\t" + "movdqa 32(%1), %%xmm5 \n\t" + "movdqa 48(%1), %%xmm7 \n\t" + "punpcklwd (%2), %%xmm0 \n\t" + "punpcklwd 16(%2), %%xmm2 \n\t" + "punpcklwd 32(%2), %%xmm4 \n\t" + "punpcklwd 48(%2), %%xmm6 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm2, 32(%0) \n\t" + "movdqa %%xmm4, 64(%0) \n\t" + "movdqa %%xmm6, 96(%0) \n\t" + "punpckhwd (%2), %%xmm1 \n\t" + "punpckhwd 16(%2), %%xmm3 \n\t" + "punpckhwd 32(%2), %%xmm5 \n\t" + "punpckhwd 48(%2), %%xmm7 \n\t" + "movdqa %%xmm1, 16(%0) \n\t" + "movdqa %%xmm3, 48(%0) \n\t" + "movdqa %%xmm5, 80(%0) \n\t" + "movdqa %%xmm7, 112(%0) \n\t" + :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) + : "memory" + ); + } + } +} + +static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + + i = 1; + b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm3, %%mm3 \n\t" + "psllw $1, %%mm3 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "psllw $13, %%mm3 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "paddw %%mm7, %%mm2 \n\t" + "paddw %%mm7, %%mm6 \n\t" + "pmulhw %%mm3, %%mm2 \n\t" + "pmulhw %%mm3, %%mm6 \n\t" + "paddw (%0), %%mm2 \n\t" + "paddw 8(%0), %%mm6 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm6, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm6, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + + i = 1; + b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); + __asm__ volatile( + "psllw $15, %%mm7 \n\t" + "pcmpeqw %%mm6, %%mm6 \n\t" + "psrlw $13, %%mm6 \n\t" + "paddw %%mm7, %%mm6 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq 2(%1), %%mm1 \n\t" + "movq 10(%1), %%mm5 \n\t" + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm5 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm5, %%mm4 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm4 \n\t" + "psraw $1, %%mm0 \n\t" + "psraw $1, %%mm4 \n\t" + "movq (%0), %%mm1 \n\t" + "movq 8(%0), %%mm5 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "psraw $2, %%mm0 \n\t" + "psraw $2, %%mm4 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + 
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + i = 0; + + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq 2(%1), %%mm2 \n\t" + "movq 10(%1), %%mm6 \n\t" + "paddw (%1), %%mm2 \n\t" + "paddw 8(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "psraw $1, %%mm2 \n\t" + "psraw $1, %%mm6 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm6, 8(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x1E) != 0x1E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=30; i>=0; i-=32){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm6 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm5 \n\t" + "movq 24(%1), %%mm7 \n\t" + "punpcklwd (%2), %%mm0 \n\t" + "punpcklwd 8(%2), %%mm2 \n\t" + "punpcklwd 16(%2), %%mm4 \n\t" + "punpcklwd 24(%2), %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, 16(%0) \n\t" + "movq %%mm4, 32(%0) \n\t" + "movq %%mm6, 48(%0) \n\t" + "punpckhwd (%2), %%mm1 \n\t" + "punpckhwd 8(%2), %%mm3 \n\t" + "punpckhwd 16(%2), %%mm5 \n\t" + "punpckhwd 24(%2), %%mm7 \n\t" + "movq %%mm1, 8(%0) \n\t" + "movq %%mm3, 24(%0) \n\t" + "movq %%mm5, 40(%0) \n\t" + "movq %%mm7, 56(%0) \n\t" + :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) + : "memory" + ); + } + } +} + +#if HAVE_7REGS +#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 48("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) + +#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ + "psubw %%"s0", %%"t0" \n\t"\ + "psubw %%"s1", %%"t1" \n\t"\ + "psubw %%"s2", %%"t2" \n\t"\ + "psubw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ + "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ + "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ + "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ + "movdqa %%"s3", 48("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ + "psraw $"n", %%"t0" \n\t"\ + "psraw $"n", %%"t1" \n\t"\ + "psraw $"n", %%"t2" \n\t"\ + "psraw $"n", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ + "paddw %%"s0", %%"t0" \n\t"\ + "paddw %%"s1", %%"t1" \n\t"\ + "paddw %%"s2", %%"t2" \n\t"\ + "paddw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ + "pmulhw %%"s0", %%"t0" \n\t"\ + "pmulhw %%"s1", %%"t1" \n\t"\ + "pmulhw %%"s2", %%"t2" \n\t"\ + "pmulhw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movdqa %%"s0", %%"t0" \n\t"\ + "movdqa %%"s1", %%"t1" \n\t"\ + "movdqa %%"s2", %%"t2" \n\t"\ + "movdqa %%"s3", %%"t3" \n\t" + +static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + + while(i & 0x1F) + { + i--; + b4[i] -= (W_DM*(b3[i] + 
b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + + __asm__ volatile ( + "jmp 2f \n\t" + "1: \n\t" + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") + + + "pcmpeqw %%xmm0, %%xmm0 \n\t" + "pcmpeqw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "psllw $13, %%xmm2 \n\t" + snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") + + "pcmpeqw %%xmm7, %%xmm7 \n\t" + "pcmpeqw %%xmm5, %%xmm5 \n\t" + "psllw $15, %%xmm7 \n\t" + "psrlw $13, %%xmm5 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") + "movq (%2,%%"REG_d"), %%xmm1 \n\t" + "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm3, %%xmm2 \n\t" + "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" + "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm4 \n\t" + "pavgw %%xmm3, %%xmm6 \n\t" + snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + + snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") + + "2: \n\t" + "sub $64, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} + +#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 24("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ + "movq %%"s0", ("w",%%"REG_d") \n\t"\ + "movq %%"s1", 8("w",%%"REG_d") \n\t"\ + "movq %%"s2", 16("w",%%"REG_d") \n\t"\ + "movq %%"s3", 24("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movq %%"s0", %%"t0" \n\t"\ + "movq %%"s1", %%"t1" \n\t"\ + 
"movq %%"s2", %%"t2" \n\t"\ + "movq %%"s3", %%"t3" \n\t" + + +static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + while(i & 15) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + + snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") + "pcmpeqw %%mm0, %%mm0 \n\t" + "pcmpeqw %%mm2, %%mm2 \n\t" + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "psllw $13, %%mm2 \n\t" + snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm5, %%mm5 \n\t" + "psllw $15, %%mm7 \n\t" + "psrlw $13, %%mm5 \n\t" + "paddw %%mm7, %%mm5 \n\t" + snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") + "movq (%2,%%"REG_d"), %%mm1 \n\t" + "movq 8(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm3, %%mm2 \n\t" + "movq 16(%2,%%"REG_d"), %%mm1 \n\t" + "movq 24(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm4 \n\t" + "pavgw %%mm3, %%mm6 \n\t" + snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + + snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") + + "2: \n\t" + "sub $32, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} +#endif //HAVE_7REGS + +#define snow_inner_add_yblock_sse2_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ + "pcmpeqd %%xmm3, %%xmm3 \n\t"\ + "psllw $15, %%xmm3 \n\t"\ + "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq (%%"REG_d", %%"REG_c"), 
%%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_end_common1\ + "add $32, %%"REG_S" \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t" + +#define snow_inner_add_yblock_sse2_end_common2\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +#define snow_inner_add_yblock_sse2_end_8\ + "sal $1, %%"REG_c" \n\t"\ + "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "sar $1, %%"REG_c" \n\t"\ + "sub $2, %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +#define snow_inner_add_yblock_sse2_end_16\ + "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "dec %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_8("2", "8") +snow_inner_add_yblock_sse2_accum_8("1", "128") +snow_inner_add_yblock_sse2_accum_8("0", "136") + + "mov %0, %%"REG_d" \n\t" + "movdqa (%%"REG_D"), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + + "punpckhwd %%xmm7, %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm0 \n\t" + "movdqa 16(%%"REG_D"), %%xmm2 \n\t" + "paddd %%xmm1, %%xmm2 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm2 \n\t" + + "mov %1, %%"REG_D" \n\t" + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" + "add %3, %%"REG_D" \n\t" + + "movdqa (%%"REG_D"), %%xmm4 \n\t" + "movdqa %%xmm5, %%xmm6 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "punpcklwd %%xmm7, %%xmm6 \n\t" + "paddd %%xmm6, %%xmm4 \n\t" + "movdqa 16(%%"REG_D"), %%xmm6 \n\t" + "paddd %%xmm5, %%xmm6 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm6 \n\t" + + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. 
*/ + "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm2, %%xmm0 \n\t" + "packuswb %%xmm7, %%xmm0 \n\t" + "movq %%xmm0, (%%"REG_d") \n\t" + + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm6, %%xmm4 \n\t" + "packuswb %%xmm7, %%xmm4 \n\t" + "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" +snow_inner_add_yblock_sse2_end_8 +} + +static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_16("2", "16") +snow_inner_add_yblock_sse2_accum_16("1", "512") +snow_inner_add_yblock_sse2_accum_16("0", "528") + + "mov %0, %%"REG_d" \n\t" + "psrlw $4, %%xmm1 \n\t" + "psrlw $4, %%xmm5 \n\t" + "paddw (%%"REG_D"), %%xmm1 \n\t" + "paddw 16(%%"REG_D"), %%xmm5 \n\t" + "paddw %%xmm3, %%xmm1 \n\t" + "paddw %%xmm3, %%xmm5 \n\t" + "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ + "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ + "packuswb %%xmm5, %%xmm1 \n\t" + + "movdqu %%xmm1, (%%"REG_d") \n\t" + +snow_inner_add_yblock_sse2_end_16 +} + +#define snow_inner_add_yblock_mmx_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%mm7, %%mm7 \n\t" /* 0 */\ + "pcmpeqd %%mm3, %%mm3 \n\t"\ + "psllw $15, %%mm3 \n\t"\ + "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ + "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%mm7, %%"out_reg1" \n\t"\ + "punpcklbw %%mm7, %%"out_reg2" \n\t"\ + "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ + "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "pmullw %%mm0, %%"out_reg1" \n\t"\ + "pmullw %%mm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ + snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ + "paddusw %%mm2, %%mm1 \n\t"\ + "paddusw %%mm6, %%mm5 \n\t" + +#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ + "mov %0, %%"REG_d" \n\t"\ + "psrlw $4, %%mm1 \n\t"\ + "psrlw $4, %%mm5 \n\t"\ + "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ + "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psraw $4, %%mm1 \n\t"\ + "psraw $4, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm1 \n\t"\ + "movq %%mm1, "write_offset"(%%"REG_d") \n\t" + +#define snow_inner_add_yblock_mmx_end(s_step)\ + "add $"s_step", %%"REG_S" \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t"\ + "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "dec %2 \n\t"\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, 
const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "8", "0") +snow_inner_add_yblock_mmx_accum("1", "128", "0") +snow_inner_add_yblock_mmx_accum("0", "136", "0") +snow_inner_add_yblock_mmx_mix("0", "0") +snow_inner_add_yblock_mmx_end("16") +} + +static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "16", "0") +snow_inner_add_yblock_mmx_accum("1", "512", "0") +snow_inner_add_yblock_mmx_accum("0", "528", "0") +snow_inner_add_yblock_mmx_mix("0", "0") + +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") +snow_inner_add_yblock_mmx_accum("2", "24", "8") +snow_inner_add_yblock_mmx_accum("1", "520", "8") +snow_inner_add_yblock_mmx_accum("0", "536", "8") +snow_inner_add_yblock_mmx_mix("16", "8") +snow_inner_add_yblock_mmx_end("32") +} + +static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) { + if (!(b_h & 1)) + inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + } else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +#endif /* HAVE_INLINE_ASM */ + +void ff_dwt_init_x86(SnowDWTContext *c) +{ +#if HAVE_INLINE_ASM + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX) { + if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ + c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; +#endif + c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; + } + else{ + if (mm_flags & AV_CPU_FLAG_MMXEXT) { + c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; +#endif + } + c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; + } + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c new file mode 100644 index 0000000000..02c5eaa2c2 --- /dev/null +++ b/libavcodec/x86/v210-init.c @@ 
-0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..6554a43de1
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,88 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask:        times 4 dd 0x3ff
+v210_mult:        dw 64,4,64,4,64,4,64,4
+v210_luma_shuf:   db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 2
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1_%2, 5, 5, 7
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6              ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4             ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d   ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5             ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8       ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6             ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM
+v210_planar_unpack unaligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack unaligned, avx
+%endif
+
+INIT_XMM
+v210_planar_unpack aligned, ssse3
+%if HAVE_AVX_EXTERNAL
+INIT_AVX
+v210_planar_unpack aligned, avx
+%endif
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index adf08d7d84..546688cf9d 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -2,20 +2,20 @@
 ;* VC1 deblocking optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
 /*
  * VC-1 and WMV3 decoder - X86 DSP init functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
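For orientation, the packing that v210_planar_unpack above inverts: v210 stores three 10-bit components in bits 0-9, 10-19 and 20-29 of each little-endian 32-bit word, six pixels per four words. A plain-C unpack under that standard-layout assumption (the SIMD version extracts the middle component with pmullw/psrlw, the outer two with psrld/pand, then sorts samples into planes with pshufb):

    #include <stdint.h>

    /* Plain-C v210 unpack, assuming the standard component order
     *   word 0: Cb0 Y0 Cr0    word 1: Y1 Cb1 Y2
     *   word 2: Cr1 Y3 Cb2    word 3: Y4 Cr2 Y5
     * so each 16-byte group yields 6 luma and 3+3 chroma samples. */
    static void v210_unpack_sketch(const uint32_t *src, uint16_t *y,
                                   uint16_t *u, uint16_t *v, int width)
    {
        for (int i = 0; i < width; i += 6) {
            uint32_t w0 = *src++, w1 = *src++, w2 = *src++, w3 = *src++;
            *u++ =  w0        & 0x3ff;
            *y++ = (w0 >> 10) & 0x3ff;
            *v++ = (w0 >> 20) & 0x3ff;
            *y++ =  w1        & 0x3ff;
            *u++ = (w1 >> 10) & 0x3ff;
            *y++ = (w1 >> 20) & 0x3ff;
            *v++ =  w2        & 0x3ff;
            *y++ = (w2 >> 10) & 0x3ff;
            *u++ = (w2 >> 20) & 0x3ff;
            *y++ =  w3        & 0x3ff;
            *v++ = (w3 >> 10) & 0x3ff;
            *y++ = (w3 >> 20) & 0x3ff;
        }
    }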
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 15fd2c830c..5ceacd348e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 59f19378ca..1ac02574d6 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
 ;* Core video DSP functions
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
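The videodsp changes that follow rework emulated_edge_mc, which handles motion-compensation reads that fall outside the decoded frame by replicating edge samples. A naive scalar statement of the contract, for orientation only; the asm splits it into a vertical pass over rows and a horizontal pass that splats edge bytes, which is what the vfix/vvar and hfix/hvar kernels below implement:

    #include <stdint.h>
    #include <stddef.h>

    static int clip_int(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Every sample of the block_w x block_h read at (src_x, src_y)
     * that lies outside the w x h frame is replaced by the nearest
     * edge sample. Here src points at the frame origin; the real API
     * hands in a block pointer instead. */
    static void edge_emu_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int block_w, int block_h,
                                int src_x, int src_y, int w, int h)
    {
        for (int y = 0; y < block_h; y++) {
            int cy = clip_int(src_y + y, 0, h - 1);
            for (int x = 0; x < block_w; x++) {
                int cx = clip_int(src_x + x, 0, w - 1);
                dst[y * dst_stride + x] = src[cy * src_stride + cx];
            }
        }
    }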
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -54,13 +54,13 @@ SECTION .text ; | | <- bottom is copied from last line in body of source ; '----' <- bh %if ARCH_X86_64 -cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ start_y, end_y, bh, w %else ; x86-32 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w %define src_strideq r3mp -%define dst_strideq r2mp - mov srcq, r1mp +%define dst_strideq r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp @@ -102,8 +102,8 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w imul wd, 0x01010101 ; w *= 0x01010101 movd m0, wd mov wq, n_wordsq ; initialize w -%if cpuflag(sse) - shufps m0, m0, q0000 ; splat +%if cpuflag(sse2) + pshufd m0, m0, q0000 ; splat %else ; mmx punpckldq m0, m0 ; splat %endif ; mmx/sse @@ -124,7 +124,7 @@ INIT_MMX mmx hvar_fn %endif -INIT_XMM sse +INIT_XMM sse2 hvar_fn ; macro to read/write a horizontal number of pixels (%2) to/from registers @@ -137,42 +137,49 @@ hvar_fn ; - if (%2 & 3) fills 1, 2 or 4 bytes in eax ; writing data out is in the same way %macro READ_NUM_BYTES 2 -%assign %%off 0 ; offset in source buffer -%assign %%idx 0 ; mmx/xmm register index +%assign %%off 0 ; offset in source buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index %rep %2/mmsize - movu m %+ %%idx, [srcq+%%off] +%if mmsize == 16 + movu xmm %+ %%xmm_idx, [srcq+%%off] +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%endif %assign %%off %%off+mmsize -%assign %%idx %%idx+1 %endrep ; %2/mmsize %if mmsize == 16 %if (%2-%%off) >= 8 %if %2 > 16 && (%2-%%off) > 8 - movu m %+ %%idx, [srcq+%2-16] + movu xmm %+ %%xmm_idx, [srcq+%2-16] +%assign %%xmm_idx %%xmm_idx+1 %assign %%off %2 %else - movq m %+ %%idx, [srcq+%%off] + movq mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 %assign %%off %%off+8 %endif -%assign %%idx %%idx+1 %endif ; (%2-%%off) >= 8 %endif %if (%2-%%off) >= 4 %if %2 > 8 && (%2-%%off) > 4 - movq m %+ %%idx, [srcq+%2-8] + movq mm %+ %%mmx_idx, [srcq+%2-8] %assign %%off %2 %else - movd m %+ %%idx, [srcq+%%off] + movd mm %+ %%mmx_idx, [srcq+%%off] %assign %%off %%off+4 %endif -%assign %%idx %%idx+1 +%assign %%mmx_idx %%mmx_idx+1 %endif ; (%2-%%off) >= 4 %if (%2-%%off) >= 1 %if %2 >= 4 - movd m %+ %%idx, [srcq+%2-4] + movd mm %+ %%mmx_idx, [srcq+%2-4] %elif (%2-%%off) == 1 mov valb, [srcq+%2-1] %elif (%2-%%off) == 2 @@ -180,48 +187,55 @@ hvar_fn %elifidn %1, body mov vald, [srcq+%2-3] %else - movd m %+ %%idx, [srcq+%2-3] + movd mm %+ %%mmx_idx, [srcq+%2-3] %endif %endif ; (%2-%%off) >= 1 %endmacro ; READ_NUM_BYTES %macro WRITE_NUM_BYTES 2 -%assign %%off 0 ; offset in destination buffer -%assign %%idx 0 ; mmx/xmm register index +%assign %%off 0 ; offset in destination buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index %rep %2/mmsize - movu [dstq+%%off], m %+ %%idx +%if mmsize == 16 + movu [dstq+%%off], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%endif %assign 
%%off %%off+mmsize -%assign %%idx %%idx+1 %endrep ; %2/mmsize %if mmsize == 16 %if (%2-%%off) >= 8 %if %2 > 16 && (%2-%%off) > 8 - movu [dstq+%2-16], m %+ %%idx + movu [dstq+%2-16], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 %assign %%off %2 %else - movq [dstq+%%off], m %+ %%idx + movq [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 %assign %%off %%off+8 %endif -%assign %%idx %%idx+1 %endif ; (%2-%%off) >= 8 %endif %if (%2-%%off) >= 4 %if %2 > 8 && (%2-%%off) > 4 - movq [dstq+%2-8], m %+ %%idx + movq [dstq+%2-8], mm %+ %%mmx_idx %assign %%off %2 %else - movd [dstq+%%off], m %+ %%idx + movd [dstq+%%off], mm %+ %%mmx_idx %assign %%off %%off+4 %endif -%assign %%idx %%idx+1 +%assign %%mmx_idx %%mmx_idx+1 %endif ; (%2-%%off) >= 4 %if (%2-%%off) >= 1 %if %2 >= 4 - movd [dstq+%2-4], m %+ %%idx + movd [dstq+%2-4], mm %+ %%mmx_idx %elif (%2-%%off) == 1 mov [dstq+%2-1], valb %elif (%2-%%off) == 2 @@ -231,7 +245,7 @@ hvar_fn shr vald, 16 mov [dstq+%2-1], valb %else - movd vald, m %+ %%idx + movd vald, mm %+ %%mmx_idx mov [dstq+%2-3], valw shr vald, 16 mov [dstq+%2-1], valb @@ -248,30 +262,30 @@ hvar_fn %rep 1+%2-%1 %if %%n <= 3 %if ARCH_X86_64 -cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ start_y, end_y, val, bh mov bhq, r6mp ; r6mp = bhmp %else ; x86-32 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh mov dstq, r0mp - mov srcq, r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp -%define dst_strideq r2mp +%define dst_strideq r1mp %define src_strideq r3mp %endif ; x86-64/32 %else %if ARCH_X86_64 -cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \ +cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ start_y, end_y, bh %else ; x86-32 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh - mov srcq, r1mp + mov srcq, r2mp mov start_yq, r4mp mov end_yq, r5mp mov bhq, r6mp -%define dst_strideq r2mp +%define dst_strideq r1mp %define src_strideq r3mp %endif ; x86-64/32 %endif @@ -330,25 +344,23 @@ VERTICAL_EXTEND 16, 22 ; obviously not the same on both sides. 
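READ_V_PIXEL below builds its fill pattern with the classic imul-by-0x01010101 byte splat; this patch also swaps the register-wide broadcast from shufps to pshufd (hence the sse to sse2 renames), presumably to keep the splat in the integer domain. The same idiom as SSE2 intrinsics, purely as an illustration:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Replicate one byte across a 16-byte register: the multiply
     * spreads the byte into all four bytes of a dword (imul vald,
     * 0x01010101), then pshufd with q0000 broadcasts that dword to
     * all four lanes. */
    static inline __m128i splat_byte(uint8_t px)
    {
        uint32_t v = px * 0x01010101u;
        return _mm_shuffle_epi32(_mm_cvtsi32_si128(v),
                                 _MM_SHUFFLE(0, 0, 0, 0));
    }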
%macro READ_V_PIXEL 2 -%if %1 == 2 - movzx valw, byte %2 - imul valw, 0x0101 -%else movzx vald, byte %2 imul vald, 0x01010101 %if %1 >= 8 movd m0, vald %if mmsize == 16 - shufps m0, m0, q0000 + pshufd m0, m0, q0000 %else punpckldq m0, m0 -%endif -%endif ; %1 >= 8 -%endif +%endif ; mmsize == 16 +%endif ; %1 > 16 %endmacro ; READ_V_PIXEL %macro WRITE_V_PIXEL 2 %assign %%off 0 + +%if %1 >= 8 + %rep %1/mmsize movu [%2+%%off], m0 %assign %%off %%off+mmsize @@ -364,27 +376,29 @@ VERTICAL_EXTEND 16, 22 %assign %%off %%off+8 %endif %endif ; %1-%%off >= 8 -%endif +%endif ; mmsize == 16 %if %1-%%off >= 4 -%if %1 > 8 %% %1-%%off > 4 +%if %1 > 8 && %1-%%off > 4 movq [%2+%1-8], m0 %assign %%off %1 -%elif %1 >= 8 && %1-%%off >= 4 - movd [%2+%%off], m0 -%assign %%off %%off+4 %else - mov [%2+%%off], vald + movd [%2+%%off], m0 %assign %%off %%off+4 %endif %endif ; %1-%%off >= 4 -%if %1-%%off >= 2 -%if %1 >= 8 - movd [%2+%1-4], m0 -%else +%else ; %1 < 8 + +%rep %1/4 + mov [%2+%%off], vald +%assign %%off %%off+4 +%endrep ; %1/4 + +%endif ; %1 >=/< 8 + +%if %1-%%off == 2 mov [%2+%%off], valw -%endif %endif ; (%1-%%off)/2 %endmacro ; WRITE_V_PIXEL @@ -409,7 +423,7 @@ H_EXTEND 2, 14 H_EXTEND 16, 22 %endif -INIT_XMM sse +INIT_XMM sse2 H_EXTEND 16, 22 %macro PREFETCH_FN 1 diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c index 79d980194c..2013a93b90 100644 --- a/libavcodec/x86/videodsp_init.c +++ b/libavcodec/x86/videodsp_init.c @@ -1,25 +1,27 @@ /* + * Copyright (C) 2002-2012 Michael Niedermayer * Copyright (C) 2012 Ronald S. Bultje * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
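The init code that follows selects between two kernel families: fully unrolled fixed-width edge extenders for widths up to 22 bytes, indexed from a table as the vfix_tbl[w - 1] call below shows, and a variable-width fallback for anything larger. The dispatch shape in C, with illustrative types and names:

    #include <stdint.h>
    #include <stddef.h>

    typedef void edge_fix_fn(uint8_t *dst, ptrdiff_t dst_stride);
    typedef void edge_var_fn(uint8_t *dst, ptrdiff_t dst_stride, int w);

    /* One unrolled kernel per width 1..22, generic loop otherwise;
     * mirrors the vfix_tbl[w - 1] / v_extend_var split visible below. */
    static void extend_dispatch_sketch(edge_fix_fn *fix_tbl[22],
                                       edge_var_fn *var_fn,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       int w)
    {
        if (w <= 22)
            fix_tbl[w - 1](dst, dst_stride);
        else
            var_fn(dst, dst_stride, w);
    }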
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/cpu.h" #include "libavutil/mem.h" @@ -28,11 +30,11 @@ #include "libavcodec/videodsp.h" #if HAVE_YASM -typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src, - x86_reg dst_stride, x86_reg src_stride, +typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh); -typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src, - x86_reg dst_stride, x86_reg src_stride, +typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh, x86_reg w); @@ -115,17 +117,17 @@ static emu_edge_hfix_func *hfixtbl_mmx[11] = { }; #endif extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; -extern emu_edge_hfix_func ff_emu_edge_hfix16_sse; -extern emu_edge_hfix_func ff_emu_edge_hfix18_sse; -extern emu_edge_hfix_func ff_emu_edge_hfix20_sse; -extern emu_edge_hfix_func ff_emu_edge_hfix22_sse; -static emu_edge_hfix_func *hfixtbl_sse[11] = { +extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; +static emu_edge_hfix_func *hfixtbl_sse2[11] = { ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, - ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse, ff_emu_edge_hfix18_sse, - ff_emu_edge_hfix20_sse, ff_emu_edge_hfix22_sse + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 }; -extern emu_edge_hvar_func ff_emu_edge_hvar_sse; +extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, ptrdiff_t dst_stride, @@ -141,14 +143,16 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; if (!w || !h) - return; + return; if (src_y >= h) { - src -= src_y * src_stride; - src_y = src_y_add = h - 1; + src -= src_y*src_stride; + src_y_add = h - 1; + src_y = h - 1; } else if (src_y <= -block_h) { - src -= src_y*src_stride; - src_y = src_y_add = 1 - block_h; + src -= src_y*src_stride; + src_y_add = 1 - block_h; + src_y = 1 - block_h; } if (src_x >= w) { src += w - 1 - src_x; @@ -162,18 +166,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, start_x = FFMAX(0, -src_x); end_y = FFMIN(block_h, h-src_y); end_x = FFMIN(block_w, w-src_x); - assert(start_x < end_x && block_w > 0); - assert(start_y < end_y && block_h > 0); + av_assert2(start_x < end_x && block_w > 0); + av_assert2(start_y < end_y && block_h > 0); // fill in the to-be-copied part plus all above/below src += (src_y_add + start_y) * src_stride + start_x; w = end_x - start_x; if (w <= 22) { - vfix_tbl[w - 1](dst + start_x, src, - dst_stride, src_stride, + vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h); } else { - v_extend_var(dst + start_x, src, 
dst_stride, src_stride, + v_extend_var(dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h, w); } @@ -211,9 +214,8 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, hfixtbl_mmx, &ff_emu_edge_hvar_mmx); } -#endif -static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src, +static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, int block_w, int block_h, @@ -221,7 +223,19 @@ static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src, { emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, - hfixtbl_sse, &ff_emu_edge_hvar_sse); + hfixtbl_mmx, &ff_emu_edge_hvar_mmx); +} +#endif + +static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_sse2, &ff_emu_edge_hvar_sse2); } #endif /* HAVE_YASM */ @@ -244,8 +258,13 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) if (EXTERNAL_MMXEXT(cpu_flags)) { ctx->prefetch = ff_prefetch_mmxext; } +#if ARCH_X86_32 if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) { ctx->emulated_edge_mc = emulated_edge_mc_sse; } +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_sse2; + } #endif /* HAVE_YASM */ } diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm index c54650eef5..b25d838868 100644 --- a/libavcodec/x86/vorbisdsp.asm +++ b/libavcodec/x86/vorbisdsp.asm @@ -2,20 +2,20 @@ ;* Vorbis x86 optimizations ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c index 2a978b66aa..284a528a0c 100644 --- a/libavcodec/x86/vorbisdsp_init.c +++ b/libavcodec/x86/vorbisdsp_init.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index fc8a047224..24496ae8cf 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -2,20 +2,20 @@ ;* MMX/SSE2-optimized functions for the VP3 decoder ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index 9e38014e4d..1f02a6f709 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -1,18 +1,20 @@ /* - * This file is part of Libav. + * Copyright (c) 2009 David Conrad <lessen42@gmail.com> * - * Libav is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -21,6 +23,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" +#include "libavutil/x86/asm.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" #include "libavcodec/vp3dsp.h" @@ -40,10 +43,68 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride, int *bounding_values); +#if HAVE_MMX_INLINE + +#define MOVQ_BFE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%"#regd", %%"#regd" \n\t" \ + "paddb %%"#regd", %%"#regd" \n\t" ::) + +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ + "movq "#rega", "#regr" \n\t" \ + "movq "#regc", "#regp" \n\t" \ + "pand "#regb", "#regr" \n\t" \ + "pand "#regd", "#regp" \n\t" \ + "pxor "#rega", "#regb" \n\t" \ + "pxor "#regc", "#regd" \n\t" \ + "pand %%mm6, "#regb" \n\t" \ + "pand %%mm6, "#regd" \n\t" \ + "psrlq $1, "#regb" \n\t" \ + "psrlq $1, "#regd" \n\t" \ + "paddb "#regb", "#regr" \n\t" \ + "paddb "#regd", "#regp" \n\t" + +static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h) +{ +// START_TIMER + MOVQ_BFE(mm6); + __asm__ volatile( + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq (%1,%4), %%mm2 \n\t" + "movq (%2,%4), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, (%3,%4) \n\t" + + "movq (%1,%4,2), %%mm0 \n\t" + "movq (%2,%4,2), %%mm1 \n\t" + "movq (%1,%5), %%mm2 \n\t" + "movq (%2,%5), %%mm3 \n\t" + "lea (%1,%4,4), %1 \n\t" + "lea (%2,%4,4), %2 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3,%4,2) \n\t" + "movq %%mm5, (%3,%5) \n\t" + "lea (%3,%4,4), %3 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+r"(h), "+r"(a), "+r"(b), "+r"(dst) + :"r"((x86_reg)stride), "r"((x86_reg)3L*stride) + :"memory"); +// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx") +} +#endif /* HAVE_MMX_INLINE */ + av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); +#if HAVE_MMX_INLINE + c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx; +#endif /* HAVE_MMX_INLINE */ + #if ARCH_X86_32 if (EXTERNAL_MMX(cpu_flags)) { c->idct_put = ff_vp3_idct_put_mmx; diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h index 0a693684af..e71dbf8ed0 100644 --- a/libavcodec/x86/vp56_arith.h +++ b/libavcodec/x86/vp56_arith.h @@ -4,20 +4,20 @@ * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> * Copyright (C) 2010 Eli Friedman * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm index 80f8ca5f38..3d874ea62a 100644 --- a/libavcodec/x86/vp6dsp.asm +++ b/libavcodec/x86/vp6dsp.asm @@ -3,20 +3,20 @@ ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c index cd94f3e038..82baee7e97 100644 --- a/libavcodec/x86/vp6dsp_init.c +++ b/libavcodec/x86/vp6dsp_init.c @@ -3,20 +3,20 @@ * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index d41b6b46ae..85c7e9948c 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index 69460aa73f..982e17d598 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -3,20 +3,20 @@ * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -382,7 +382,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { + if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm index cbad085558..45dd54b6a2 100644 --- a/libavcodec/x86/vp8dsp_loopfilter.asm +++ b/libavcodec/x86/vp8dsp_loopfilter.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index ce58c08a3b..ab3396e098 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -1,41 +1,38 @@ /* * VP9 SIMD optimizations * - * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com> + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/vp9.h" +#include "libavcodec/vp9dsp.h" #if HAVE_YASM -#define fpel_func(avg, sz, opt) \ -void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, int my) - +#define fpel_func(avg, sz, opt) \ +void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) fpel_func(put, 4, mmx); fpel_func(put, 8, mmx); fpel_func(put, 16, sse); @@ -48,55 +45,46 @@ fpel_func(avg, 32, sse2); fpel_func(avg, 64, sse2); #undef fpel_func -#define mc_func(avg, sz, dir, opt) \ -void \ -ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, \ - const int8_t (*filter)[16]) - -#define mc_funcs(sz) \ - mc_func(put, sz, h, ssse3); \ - mc_func(avg, sz, h, ssse3); \ - mc_func(put, sz, v, ssse3); \ - mc_func(avg, sz, v, ssse3) +#define mc_func(avg, sz, dir, opt) \ +void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const int8_t (*filter)[16]) +#define mc_funcs(sz) \ +mc_func(put, sz, h, ssse3); \ +mc_func(avg, sz, h, ssse3); \ +mc_func(put, sz, v, ssse3); \ +mc_func(avg, sz, v, ssse3) mc_funcs(4); mc_funcs(8); +#if ARCH_X86_64 
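/* editor's note: the 16-wide 8tap MC functions are assembled directly only on
   x86-64, where 16 XMM registers are available (32-bit mode has just 8, as the
   idct code later in this diff also notes); on x86-32 the 16-wide case is
   instead synthesized by mc_rep_funcs(16, 8) below, which issues two 8-wide
   calls covering the left and right halves of the block. */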
+mc_funcs(16); +#endif #undef mc_funcs #undef mc_func -#define mc_rep_func(avg, sz, hsz, dir, opt) \ -static av_always_inline void \ -ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, \ - const int8_t (*filter)[16]) \ -{ \ - ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src, \ - dst_stride, \ - src_stride, \ - h, \ - filter); \ - ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz, \ - src + hsz, \ - dst_stride, \ - src_stride, \ - h, filter); \ +#define mc_rep_func(avg, sz, hsz, dir, opt) \ +static av_always_inline void \ +ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const int8_t (*filter)[16]) \ +{ \ + ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \ + src_stride, h, filter); \ + ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \ + src_stride, h, filter); \ } -#define mc_rep_funcs(sz, hsz) \ - mc_rep_func(put, sz, hsz, h, ssse3); \ - mc_rep_func(avg, sz, hsz, h, ssse3); \ - mc_rep_func(put, sz, hsz, v, ssse3); \ - mc_rep_func(avg, sz, hsz, v, ssse3) +#define mc_rep_funcs(sz, hsz) \ +mc_rep_func(put, sz, hsz, h, ssse3); \ +mc_rep_func(avg, sz, hsz, h, ssse3); \ +mc_rep_func(put, sz, hsz, v, ssse3); \ +mc_rep_func(avg, sz, hsz, v, ssse3) +#if ARCH_X86_32 mc_rep_funcs(16, 8); +#endif mc_rep_funcs(32, 16); mc_rep_funcs(64, 32); @@ -105,36 +93,29 @@ mc_rep_funcs(64, 32); extern const int8_t ff_filters_ssse3[3][15][4][16]; -#define filter_8tap_2d_fn(op, sz, f, fname) \ -static void \ -op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, int my) \ -{ \ - LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \ - ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride, \ - 64, src_stride, \ - h + 7, \ - ff_filters_ssse3[f][mx - 1]); \ - ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64, \ - dst_stride, 64, \ - h, \ - ff_filters_ssse3[f][my - 1]); \ +#define filter_8tap_2d_fn(op, sz, f, fname) \ +static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \ + ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \ + h + 7, ff_filters_ssse3[f][mx - 1]); \ + ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \ + h, ff_filters_ssse3[f][my - 1]); \ } -#define filters_8tap_2d_fn(op, sz) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \ - filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth) +#define filters_8tap_2d_fn(op, sz) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth) #define filters_8tap_2d_fn2(op) \ - filters_8tap_2d_fn(op, 64) \ - filters_8tap_2d_fn(op, 32) \ - filters_8tap_2d_fn(op, 16) \ - filters_8tap_2d_fn(op, 8) \ - filters_8tap_2d_fn(op, 4) +filters_8tap_2d_fn(op, 64) \ +filters_8tap_2d_fn(op, 32) \ +filters_8tap_2d_fn(op, 16) \ +filters_8tap_2d_fn(op, 8) \ +filters_8tap_2d_fn(op, 4) filters_8tap_2d_fn2(put) filters_8tap_2d_fn2(avg) @@ -143,36 +124,30 @@ filters_8tap_2d_fn2(avg) #undef filters_8tap_2d_fn #undef filter_8tap_2d_fn -#define 
filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \ -static void \ -op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst, \ - const uint8_t *src, \ - ptrdiff_t dst_stride, \ - ptrdiff_t src_stride, \ - int h, int mx, \ - int my) \ -{ \ - ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src, \ - dst_stride, \ - src_stride, h, \ - ff_filters_ssse3[f][dvar - 1]); \ +#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \ +static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \ + h, ff_filters_ssse3[f][dvar - 1]); \ } -#define filters_8tap_1d_fn(op, sz, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \ - filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar) +#define filters_8tap_1d_fn(op, sz, dir, dvar) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar) -#define filters_8tap_1d_fn2(op, sz) \ - filters_8tap_1d_fn(op, sz, h, mx) \ - filters_8tap_1d_fn(op, sz, v, my) +#define filters_8tap_1d_fn2(op, sz) \ +filters_8tap_1d_fn(op, sz, h, mx) \ +filters_8tap_1d_fn(op, sz, v, my) #define filters_8tap_1d_fn3(op) \ - filters_8tap_1d_fn2(op, 64) \ - filters_8tap_1d_fn2(op, 32) \ - filters_8tap_1d_fn2(op, 16) \ - filters_8tap_1d_fn2(op, 8) \ - filters_8tap_1d_fn2(op, 4) +filters_8tap_1d_fn2(op, 64) \ +filters_8tap_1d_fn2(op, 32) \ +filters_8tap_1d_fn2(op, 16) \ +filters_8tap_1d_fn2(op, 8) \ +filters_8tap_1d_fn2(op, 4) filters_8tap_1d_fn3(put) filters_8tap_1d_fn3(avg) @@ -182,6 +157,38 @@ filters_8tap_1d_fn3(avg) #undef filters_8tap_1d_fn3 #undef filter_8tap_1d_fn +#define itxfm_func(typea, typeb, size, opt) \ +void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ + int16_t *block, int eob) +#define itxfm_funcs(size, opt) \ +itxfm_func(idct, idct, size, opt); \ +itxfm_func(iadst, idct, size, opt); \ +itxfm_func(idct, iadst, size, opt); \ +itxfm_func(iadst, iadst, size, opt) + +itxfm_func(idct, idct, 4, ssse3); +itxfm_func(idct, idct, 8, ssse3); +itxfm_func(idct, idct, 8, avx); +itxfm_funcs(16, ssse3); +itxfm_funcs(16, avx); +itxfm_func(idct, idct, 32, ssse3); +itxfm_func(idct, idct, 32, avx); + +#undef itxfm_func +#undef itxfm_funcs + +#define lpf_funcs(size1, size2, opt) \ +void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H); \ +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +lpf_funcs(16, 16, sse2); +lpf_funcs(16, 16, ssse3); +lpf_funcs(16, 16, avx); + +#undef lpf_funcs + #endif /* HAVE_YASM */ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) @@ -189,29 +196,29 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) #if HAVE_YASM int cpu_flags = av_get_cpu_flags(); -#define init_fpel(idx1, idx2, sz, type, opt) \ - dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ - dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ - dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ - dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt +#define init_fpel(idx1, idx2, sz, type, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + 
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_##opt #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ - dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv] = type ## _8tap_smooth_ ## sz ## dir ## _ ## opt; \ - dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \ - dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt -#define init_subpel2(idx, idxh, idxv, dir, type, opt) \ +#define init_subpel2(idx, idxh, idxv, dir, type, opt) \ init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \ init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \ init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \ init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \ init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt) -#define init_subpel3(idx, type, opt) \ +#define init_subpel3(idx, type, opt) \ init_subpel2(idx, 1, 1, hv, type, opt); \ - init_subpel2(idx, 0, 1, v, type, opt); \ - init_subpel2(idx, 1, 0, h, type, opt) + init_subpel2(idx, 0, 1, v, type, opt); \ + init_subpel2(idx, 1, 0, h, type, opt) if (EXTERNAL_MMX(cpu_flags)) { init_fpel(4, 0, 4, put, mmx); @@ -230,11 +237,45 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_fpel(2, 1, 16, avg, sse2); init_fpel(1, 1, 32, avg, sse2); init_fpel(0, 1, 64, avg, sse2); + if (ARCH_X86_64) { + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2; + } } if (EXTERNAL_SSSE3(cpu_flags)) { init_subpel3(0, put, ssse3); init_subpel3(1, avg, ssse3); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; + if (ARCH_X86_64) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3; + } + } + + if (EXTERNAL_AVX(cpu_flags)) { + if (ARCH_X86_64) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx; + } } #undef init_fpel diff --git a/libavcodec/x86/vp9itxfm.asm 
b/libavcodec/x86/vp9itxfm.asm new file mode 100644 index 0000000000..dfc877e8e5 --- /dev/null +++ b/libavcodec/x86/vp9itxfm.asm @@ -0,0 +1,1494 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pw_m11585x2: times 8 dw -23170 + +%macro VP9_IDCT_COEFFS 2-3 0 +pw_%1x2: times 8 dw %1*2 +pw_m%1x2: times 8 dw -%1*2 +pw_%2x2: times 8 dw %2*2 +pw_m%2x2: times 8 dw -%2*2 +pw_m%1_%2: times 4 dw -%1, %2 +pw_%2_%1: times 4 dw %2, %1 +pw_m%2_m%1: times 4 dw -%2, -%1 +%if %3 == 1 +pw_m%2_%1: times 4 dw -%2, %1 +pw_%1_%2: times 4 dw %1, %2 +%endif +%endmacro + +VP9_IDCT_COEFFS 15137, 6270, 1 +VP9_IDCT_COEFFS 16069, 3196, 1 +VP9_IDCT_COEFFS 9102, 13623, 1 +VP9_IDCT_COEFFS 16305, 1606 +VP9_IDCT_COEFFS 10394, 12665 +VP9_IDCT_COEFFS 14449, 7723 +VP9_IDCT_COEFFS 4756, 15679 +VP9_IDCT_COEFFS 16364, 804 +VP9_IDCT_COEFFS 11003, 12140 +VP9_IDCT_COEFFS 14811, 7005 +VP9_IDCT_COEFFS 5520, 15426 +VP9_IDCT_COEFFS 15893, 3981 +VP9_IDCT_COEFFS 8423, 14053 +VP9_IDCT_COEFFS 13160, 9760 +VP9_IDCT_COEFFS 2404, 16207 + +pd_8192: times 4 dd 8192 +pw_2048: times 8 dw 2048 +pw_1024: times 8 dw 1024 +pw_512: times 8 dw 512 +pw_m1: times 8 dw -1 + +SECTION .text + +; (a*x + b*y + round) >> shift +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst + movh m%3, [%6] + movh m%4, [%6+strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [%6], m%3 + movh [%6+strideq], m%4 +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 
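; block points at the int16_t coefficient array; the coded coefficients are
; zeroed on return. eob counts the coded coefficients in scan order, so a
; small eob guarantees only the top-left corner of the block is populated,
; which is what the fast paths below key on.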
+;------------------------------------------------------------------------------------------- + +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +%macro VP9_IDCT4_1D 0 + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +; 2x2 top left corner +%macro VP9_IDCT4_2x2_1D 0 + pmulhrsw m0, m5 ; m0=t1 + mova m2, m0 ; m2=t0 + mova m3, m1 + pmulhrsw m1, m6 ; m1=t2 + pmulhrsw m3, m7 ; m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +%macro VP9_IDCT4_WRITEOUT 0 + mova m5, [pw_2048] + pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 + pmulhrsw m1, m5 + VP9_STORE_2X 0, 1, 6, 7, 4 + lea dstq, [dstq+2*strideq] + pmulhrsw m2, m5 + pmulhrsw m3, m5 + VP9_STORE_2X 2, 3, 6, 7, 4 +%endmacro + +INIT_MMX ssse3 +cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob + + cmp eobd, 4 ; 2x2 or smaller + jg .idctfull + + cmp eobd, 1 ; faster path for when only DC is set + jne .idct2x2 + + movd m0, [blockq] + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 + pshufw m0, m0, 0 + pxor m4, m4 + movh [blockq], m4 + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +; faster path for when only top left 2x2 block is set +.idct2x2: + movd m0, [blockq+0] + movd m1, [blockq+8] + mova m5, [pw_11585x2] + mova m6, [pw_6270x2] + mova m7, [pw_15137x2] + VP9_IDCT4_2x2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_2x2_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + movh [blockq+ 0], m4 + movh [blockq+ 8], m4 + VP9_IDCT4_WRITEOUT + RET + +.idctfull: ; generic full 4x4 idct/idct + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] + mova m6, [pw_11585x2] + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET + +%if ARCH_X86_64 ; TODO: 32-bit? 
(32-bit limited to 8 xmm reg, we use more) + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IDCT8_1D_FINALIZE 0 + SUMSUB_BA w, 3, 10, 4 ; m3=t0+t7, m10=t0-t7 + SUMSUB_BA w, 1, 2, 4 ; m1=t1+t6, m2=t1-t6 + SUMSUB_BA w, 11, 0, 4 ; m11=t2+t5, m0=t2-t5 + SUMSUB_BA w, 9, 8, 4 ; m9=t3+t4, m8=t3-t4 + SWAP 11, 10, 2 + SWAP 3, 9, 0 +%endmacro + +%macro VP9_IDCT8_1D 0 + SUMSUB_BA w, 8, 0, 4 ; m8=IN(0)+IN(4) m0=IN(0)-IN(4) + pmulhrsw m8, m12 ; m8=t0a + pmulhrsw m0, m12 ; m0=t1a + VP9_UNPACK_MULSUB_2W_4X 2, 10, 15137, 6270, m7, 4, 5 ; m2=t2a, m10=t3a + VP9_UNPACK_MULSUB_2W_4X 1, 11, 16069, 3196, m7, 4, 5 ; m1=t4a, m11=t7a + VP9_UNPACK_MULSUB_2W_4X 9, 3, 9102, 13623, m7, 4, 5 ; m9=t5a, m3=t6a + SUMSUB_BA w, 10, 8, 4 ; m10=t0a+t3a (t0), m8=t0a-t3a (t3) + SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + SUMSUB_BA w, 9, 1, 4 ; m9=t4a+t5a (t4), m1=t4a-t5a (t5a) + SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a) + SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5) + pmulhrsw m1, m12 ; m1=t6 + pmulhrsw m11, m12 ; m11=t5 + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_4x4_1D 0 + pmulhrsw m0, m12 ; m0=t1a/t0a + pmulhrsw m10, m2, [pw_15137x2] ; m10=t3a + pmulhrsw m2, [pw_6270x2] ; m2=t2a + pmulhrsw m11, m1, [pw_16069x2] ; m11=t7a + pmulhrsw m1, [pw_3196x2] ; m1=t4a + pmulhrsw m9, m3, [pw_9102x2] ; m9=-t5a + pmulhrsw m3, [pw_13623x2] ; m3=t6a + psubw m8, m0, m10 ; m8=t0a-t3a (t3) + paddw m10, m0 ; m10=t0a+t3a (t0) + SUMSUB_BA w, 2, 0, 4 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + SUMSUB_BA w, 9, 1, 4 ; m1=t4a+t5a (t4), m9=t4a-t5a (t5a) + SWAP 1, 9 + SUMSUB_BA w, 3, 11, 4 ; m3=t7a+t6a (t7), m11=t7a-t6a (t6a) + SUMSUB_BA w, 1, 11, 4 ; m1=t6a+t5a (t6), m11=t6a-t5a (t5) + pmulhrsw m1, m12 ; m1=t6 + pmulhrsw m11, m12 ; m11=t5 + VP9_IDCT8_1D_FINALIZE +%endmacro + +; TODO: a lot of t* copies can probably be removed and merged with +; following SUMSUBs from VP9_IDCT8_1D_FINALIZE with AVX +%macro VP9_IDCT8_2x2_1D 0 + pmulhrsw m0, m12 ; m0=t0 + mova m3, m1 + pmulhrsw m1, m6 ; m1=t4 + pmulhrsw m3, m7 ; m3=t7 + mova m2, m0 ; m2=t1 + mova m10, m0 ; m10=t2 + mova m8, m0 ; m8=t3 + mova m11, m3 ; t5 = t7a ... + mova m9, m3 ; t6 = t7a ... 
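    ; (2x2 shortcut: with IN(2)/IN(4)/IN(6) zero the even half collapses, so
    ; t0..t3 are all the same IN(0)*sqrt(2)/2 value, hence the plain register
    ; copies above; only IN(1) feeds the odd half, and t5/t6 fall out below as
    ; the difference/sum of t4a and t7a rescaled by pw_11585x2, i.e. sqrt(2)/2)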
+ psubw m11, m1 ; t5 = t7a - t4a + paddw m9, m1 ; t6 = t7a + t4a + pmulhrsw m11, m12 ; m11=t5 + pmulhrsw m9, m12 ; m9=t6 + SWAP 0, 10 + SWAP 9, 1 + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_WRITEOUT 0 + mova m5, [pw_1024] + pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 + pmulhrsw m1, m5 + VP9_STORE_2X 0, 1, 6, 7, 4 + lea dstq, [dstq+2*strideq] + pmulhrsw m2, m5 + pmulhrsw m3, m5 + VP9_STORE_2X 2, 3, 6, 7, 4 + lea dstq, [dstq+2*strideq] + pmulhrsw m8, m5 + pmulhrsw m9, m5 + VP9_STORE_2X 8, 9, 6, 7, 4 + lea dstq, [dstq+2*strideq] + pmulhrsw m10, m5 + pmulhrsw m11, m5 + VP9_STORE_2X 10, 11, 6, 7, 4 +%endmacro + +%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob + + mova m12, [pw_11585x2] ; often used + + cmp eobd, 12 ; top left half or less + jg .idctfull + + cmp eobd, 3 ; top left corner or less + jg .idcthalf + + cmp eobd, 1 ; faster path for when only DC is set + jne .idcttopleftcorner + + movd m0, [blockq] + pmulhrsw m0, m12 + pmulhrsw m0, m12 + SPLATW m0, m0, 0 + pxor m4, m4 + movd [blockq], m4 + mova m5, [pw_1024] + pmulhrsw m0, m5 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +; faster path for when only left corner is set (3 input: DC, right to DC, below +; to DC). Note: also working with a 2x2 block +.idcttopleftcorner: + movd m0, [blockq+0] + movd m1, [blockq+16] + mova m6, [pw_3196x2] + mova m7, [pw_16069x2] + VP9_IDCT8_2x2_1D + TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + VP9_IDCT8_2x2_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + movd [blockq+ 0], m4 + movd [blockq+16], m4 + VP9_IDCT8_WRITEOUT + RET + +.idcthalf: + movh m0, [blockq + 0] + movh m1, [blockq +16] + movh m2, [blockq +32] + movh m3, [blockq +48] + VP9_IDCT8_4x4_1D + TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + VP9_IDCT8_4x4_1D + pxor m4, m4 + movh [blockq+ 0], m4 + movh [blockq+16], m4 + movh [blockq+32], m4 + movh [blockq+48], m4 + VP9_IDCT8_WRITEOUT + RET + +.idctfull: ; generic full 8x8 idct/idct + mova m0, [blockq+ 0] ; IN(0) + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) + mova m3, [blockq+ 48] ; IN(3) + mova m8, [blockq+ 64] ; IN(4) + mova m9, [blockq+ 80] ; IN(5) + mova m10, [blockq+ 96] ; IN(6) + mova m11, [blockq+112] ; IN(7) + mova m7, [pd_8192] ; rounding + VP9_IDCT8_1D + TRANSPOSE8x8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + VP9_IDCT8_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 16], m4 + mova [blockq+ 32], m4 + mova [blockq+ 48], m4 + mova [blockq+ 64], m4 + mova [blockq+ 80], m4 + mova [blockq+ 96], m4 + mova [blockq+112], m4 + VP9_IDCT8_WRITEOUT + RET +%endmacro + +VP9_IDCT_IDCT_8x8_ADD_XMM ssse3 +VP9_IDCT_IDCT_8x8_ADD_XMM avx + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; at the end of this macro, m7 is stored in stack_scratch +; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15 +; the following sumsubs have not been done yet: +; SUMSUB_BA w, 6, 9, 15 ; t6, t9 +; SUMSUB_BA w, 7, 8, 15 ; t7, t8 +%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch +%if %2 <= 4 + mova m3, [%1+ 1*%3] 
; IN(1) + mova m12, [%1+ 2*%3] ; IN(2) + mova m0, [%1+ 3*%3] ; IN(3) + + pmulhrsw m15, m12, [pw_16069x2] ; t6-7 + pmulhrsw m12, [pw_3196x2] ; t4-5 + pmulhrsw m4, m3, [pw_16305x2] ; t14-15 + pmulhrsw m3, [pw_1606x2] ; t8-9 + pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 + pmulhrsw m0, [pw_15679x2] ; t12-13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + paddw m14, m15, m12 + psubw m13, m15, m12 + pmulhrsw m13, [pw_11585x2] ; t5 + pmulhrsw m14, [pw_11585x2] ; t6 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 +%else + mova m5, [%1+ 1*%3] ; IN(1) + mova m14, [%1+ 2*%3] ; IN(2) + mova m6, [%1+ 3*%3] ; IN(3) + mova m9, [%1+ 4*%3] ; IN(4) + mova m7, [%1+ 5*%3] ; IN(5) + mova m15, [%1+ 6*%3] ; IN(6) + mova m4, [%1+ 7*%3] ; IN(7) +%if %2 <= 8 + pmulhrsw m8, m9, [pw_15137x2] ; t3 + pmulhrsw m9, [pw_6270x2] ; t2 + pmulhrsw m13, m14, [pw_16069x2] ; t7 + pmulhrsw m14, [pw_3196x2] ; t4 + pmulhrsw m12, m15, [pw_m9102x2] ; t5 + pmulhrsw m15, [pw_13623x2] ; t6 + pmulhrsw m2, m5, [pw_16305x2] ; t15 + pmulhrsw m5, [pw_1606x2] ; t8 + pmulhrsw m3, m4, [pw_m10394x2] ; t9 + pmulhrsw m4, [pw_12665x2] ; t14 + pmulhrsw m0, m7, [pw_14449x2] ; t13 + pmulhrsw m7, [pw_7723x2] ; t10 + pmulhrsw m1, m6, [pw_m4756x2] ; t11 + pmulhrsw m6, [pw_15679x2] ; t12 +%else + mova m3, [%1+ 9*%3] ; IN(9) + mova m12, [%1+10*%3] ; IN(10) + mova m0, [%1+11*%3] ; IN(11) + mova m8, [%1+12*%3] ; IN(12) + mova m1, [%1+13*%3] ; IN(13) + mova m13, [%1+14*%3] ; IN(14) + mova m2, [%1+15*%3] ; IN(15) + + ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7 + ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15 + + VP9_UNPACK_MULSUB_2W_4X 9, 8, 15137, 6270, [pd_8192], 10, 11 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 14, 13, 16069, 3196, [pd_8192], 10, 11 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 12, 15, 9102, 13623, [pd_8192], 10, 11 ; t5, t6 + VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 10, 11 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 10, 11 ; t9, t14 + VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 10, 11 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 10, 11 ; t11, t12 +%endif + + ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7 + ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15 + + SUMSUB_BA w, 12, 14, 10 ; t4, t5 + SUMSUB_BA w, 15, 13, 10 ; t7, t6 + SUMSUB_BA w, 3, 5, 10 ; t8, t9 + SUMSUB_BA w, 7, 1, 10 ; t11, t10 + SUMSUB_BA w, 0, 6, 10 ; t12, t13 + SUMSUB_BA w, 4, 2, 10 ; t15, t14 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + SUMSUB_BA w, 14, 13, 10 + pmulhrsw m13, [pw_11585x2] ; t5 + pmulhrsw m14, [pw_11585x2] ; t6 + VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13 +%endif + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 + ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 + + SUMSUB_BA w, 7, 3, 10 ; t8, t11 + SUMSUB_BA w, 6, 2, 10 ; t9, t10 + SUMSUB_BA w, 0, 4, 10 ; t15, t12 + SUMSUB_BA w, 1, 5, 10 ; t14. 
t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 + + SUMSUB_BA w, 2, 5, 10 + SUMSUB_BA w, 3, 4, 10 + pmulhrsw m5, [pw_11585x2] ; t10 + pmulhrsw m4, [pw_11585x2] ; t11 + pmulhrsw m3, [pw_11585x2] ; t12 + pmulhrsw m2, [pw_11585x2] ; t13 + + ; backup first register + mova [%4], m7 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 + + ; from load/start +%if %2 <= 4 + mova m11, [%1+ 0*%3] ; IN(0) + pmulhrsw m11, [pw_11585x2] ; t0-t3 + + psubw m8, m11, m15 + paddw m15, m11 + psubw m9, m11, m14 + paddw m14, m11 + psubw m10, m11, m13 + paddw m13, m11 +%else + mova m10, [%1+ 0*%3] ; IN(0) +%if %2 <= 8 + pmulhrsw m10, [pw_11585x2] ; t0 and t1 + psubw m11, m10, m8 + paddw m8, m10 +%else + mova m11, [%1+ 8*%3] ; IN(8) + + ; from 3 stages back + SUMSUB_BA w, 11, 10, 7 + pmulhrsw m11, [pw_11585x2] ; t0 + pmulhrsw m10, [pw_11585x2] ; t1 + + ; from 2 stages back + SUMSUB_BA w, 8, 11, 7 ; t0, t3 +%endif + SUMSUB_BA w, 9, 10, 7 ; t1, t2 + + ; from 1 stage back + SUMSUB_BA w, 15, 8, 7 ; t0, t7 + SUMSUB_BA w, 14, 9, 7 ; t1, t6 + SUMSUB_BA w, 13, 10, 7 ; t2, t5 +%endif + SUMSUB_BA w, 12, 11, 7 ; t3, t4 + + SUMSUB_BA w, 0, 15, 7 ; t0, t15 + SUMSUB_BA w, 1, 14, 7 ; t1, t14 + SUMSUB_BA w, 2, 13, 7 ; t2, t13 + SUMSUB_BA w, 3, 12, 7 ; t3, t12 + SUMSUB_BA w, 4, 11, 7 ; t4, t11 + SUMSUB_BA w, 5, 10, 7 ; t5, t10 +%endmacro + +%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc + VP9_IDCT16_1D_START %1, %3, 32, rsp+32 + +%if %2 == 1 + ; backup a different register + mova [rsp+16], m15 + mova m7, [rsp+32] + + SUMSUB_BA w, 6, 9, 15 ; t6, t9 + SUMSUB_BA w, 7, 8, 15 ; t7, t8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 + mova [rsp+ 0], m0 + mova [rsp+ 32], m1 + mova [rsp+ 64], m2 + mova [rsp+ 96], m3 + mova [rsp+128], m4 + mova [rsp+160], m5 + mova [rsp+192], m6 + mova [rsp+224], m7 + + mova m15, [rsp+16] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [rsp+ 16], m8 + mova [rsp+ 48], m9 + mova [rsp+ 80], m10 + mova [rsp+112], m11 + mova [rsp+144], m12 + mova [rsp+176], m13 + mova [rsp+208], m14 + mova [rsp+240], m15 +%else ; %2 == 2 + ; backup more registers + mova [rsp+64], m8 + mova [rsp+96], m9 + + pxor m7, m7 + pmulhrsw m0, [pw_512] + pmulhrsw m1, [pw_512] + VP9_STORE_2X 0, 1, 8, 9, 7 + lea dstq, [dstq+strideq*2] + pmulhrsw m2, [pw_512] + pmulhrsw m3, [pw_512] + VP9_STORE_2X 2, 3, 8, 9, 7 + lea dstq, [dstq+strideq*2] + pmulhrsw m4, [pw_512] + pmulhrsw m5, [pw_512] + VP9_STORE_2X 4, 5, 8, 9, 7 + lea dstq, [dstq+strideq*2] + + ; restore from cache + SWAP 0, 7 ; move zero from m7 to m0 + mova m7, [rsp+32] + mova m8, [rsp+64] + mova m9, [rsp+96] + + SUMSUB_BA w, 6, 9, 1 ; t6, t9 + SUMSUB_BA w, 7, 8, 1 ; t7, t8 + + pmulhrsw m6, [pw_512] + pmulhrsw m7, [pw_512] + VP9_STORE_2X 6, 7, 1, 2, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m8, [pw_512] + pmulhrsw m9, [pw_512] + VP9_STORE_2X 8, 9, 1, 2, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m10, [pw_512] + pmulhrsw m11, [pw_512] + VP9_STORE_2X 10, 11, 1, 2, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m12, [pw_512] + pmulhrsw m13, [pw_512] + VP9_STORE_2X 12, 13, 1, 2, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m14, [pw_512] + pmulhrsw m15, [pw_512] + VP9_STORE_2X 14, 15, 1, 2, 0 +%endif ; %2 == 1/2 +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*2/mmsize + mova [%1+%%y+%%x], %4 +%assign %%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + 
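Every writeout path above leans on the same trick, flagged in comments like
"(x*2048 + (1<<14))>>15 <=> (x+8)>>4": pmulhrsw computes (x*y + (1<<14)) >> 15
on each signed word, so multiplying by pw_2048, pw_1024 or pw_512 performs
exactly the rounded final shift of the 4x4, 8x8 and 16x16/32x32 transforms,
while pw_11585x2 (23170/32768 ~= sqrt(2)/2) folds the sqrt(1/2) stage scaling
into the same instruction. A minimal standalone C check of the identity --
illustration only, not part of the patch; pmulhrsw_lane is a scalar stand-in
for one word lane, and arithmetic right shift of negative ints is assumed,
just as the assembly's own comments assume:

#include <assert.h>
#include <stdint.h>

/* pmulhrsw_lane: scalar model of one word lane of SSSE3 pmulhrsw */
static int16_t pmulhrsw_lane(int16_t x, int16_t y)
{
    return (int16_t)(((int32_t)x * y + (1 << 14)) >> 15);
}

int main(void)
{
    int x;
    for (x = -4096; x <= 4096; x++) {
        assert(pmulhrsw_lane(x, 2048) == (x +  8) >> 4); /* pw_2048: 4x4 writeout    */
        assert(pmulhrsw_lane(x, 1024) == (x + 16) >> 5); /* pw_1024: 8x8 writeout    */
        assert(pmulhrsw_lane(x,  512) == (x + 32) >> 6); /* pw_512: 16x16/32x32      */
    }
    return 0;
}

The *x2 constant names come from the same identity: VP9's transform
coefficients are 14-bit fixed point, and storing them pre-doubled turns the
required >>14 into the >>15 that pmulhrsw provides.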
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride + mova m%3, [dstq] + mova m%5, [dstq+%7] + punpcklbw m%2, m%3, m%6 + punpckhbw m%3, m%6 + punpcklbw m%4, m%5, m%6 + punpckhbw m%5, m%6 + paddw m%2, m%1 + paddw m%3, m%1 + paddw m%4, m%1 + paddw m%5, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], m%2 + mova [dstq+%7], m%4 +%endmacro + +%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob + ; 2x2=eob=3, 4x4=eob=10 + cmp eobd, 38 + jg .idctfull + cmp eobd, 1 ; faster path for when only DC is set + jne .idct8x8 + + ; dc-only + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + SPLATW m0, m0, q0000 + pmulhrsw m0, [pw_512] + pxor m5, m5 + movd [blockq], m5 +%rep 7 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + RET + +.idct8x8: + DEFINE_ARGS dst, stride, block, cnt, dst_bak + VP9_IDCT16_1D blockq, 1, 8 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_8x8: + VP9_IDCT16_1D rsp, 2, 8 + lea dstq, [dst_bakq+8] + add rsp, 16 + dec cntd + jg .loop2_8x8 + sub rsp, 32 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 8, m0 + RET + +.idctfull: + DEFINE_ARGS dst, stride, block, cnt, dst_bak + mov cntd, 2 +.loop1_full: + VP9_IDCT16_1D blockq, 1 + add blockq, 16 + add rsp, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + sub rsp, 512 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_full: + VP9_IDCT16_1D rsp, 2 + lea dstq, [dst_bakq+8] + add rsp, 16 + dec cntd + jg .loop2_full + sub rsp, 32 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 +VP9_IDCT_IDCT_16x16_ADD_XMM avx + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 + punpckhwd m%4, m%2, m%1 + punpcklwd m%2, m%1 + pmaddwd m%3, m%4, [pw_m%5_%6] + pmaddwd m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_m%5_%6] + pmaddwd m%2, [pw_%6_%5] +%endmacro + +%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%1, %6 + paddd m%2, %6 + paddd m%3, %6 + paddd m%4, %6 + psrad m%1, 14 + psrad m%2, 14 + psrad m%3, 14 + psrad m%4, 14 + packssdw m%1, m%3 + packssdw m%2, m%4 +%endmacro + +%macro VP9_IADST16_1D 2 ; src, pass +%assign %%str 16*%2 + mova m0, [%1+ 0*32] ; in0 + mova m1, [%1+15*32] ; in15 + mova m8, [%1+ 7*32] ; in7 + mova m9, [%1+ 8*32] ; in8 + + VP9_UNPACK_MULSUB_2D_4X 1, 0, 2, 3, 16364, 804 ; m1/2=t1[d], m0/3=t0[d] + VP9_UNPACK_MULSUB_2D_4X 8, 9, 11, 10, 11003, 12140 ; m8/11=t9[d], m9/10=t8[d] + VP9_RND_SH_SUMSUB_BA 9, 0, 10, 3, 4, [pd_8192] ; m9=t0[w], m0=t8[w] + VP9_RND_SH_SUMSUB_BA 8, 1, 11, 2, 4, [pd_8192] ; m8=t1[w], m1=t9[w] + + mova m11, [%1+ 2*32] ; in2 + mova m10, [%1+13*32] ; in13 + mova m3, [%1+ 5*32] ; in5 + mova m2, [%1+10*32] ; in10 + + VP9_UNPACK_MULSUB_2D_4X 10, 11, 6, 7, 15893, 3981 ; m10/6=t3[d], m11/7=t2[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] + VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; 
m2=t2[w], m11=t10[w] + VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w] + + mova [rsp+ 0*%%str], m9 ; make some scratch space (t0:m9->r0) + mova m4, [%1+ 4*32] ; in4 + mova m5, [%1+11*32] ; in11 + mova m12, [%1+ 3*32] ; in3 + mova m13, [%1+12*32] ; in12 + + VP9_UNPACK_MULSUB_2D_4X 5, 4, 7, 6, 14811, 7005 ; m5/7=t5[d], m4/6=t4[d] + VP9_UNPACK_MULSUB_2D_4X 12, 13, 14, 15, 5520, 15426 ; m12/14=t13[d], m13/15=t12[d] + VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w] + VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w] + + mova [rsp+ 2*%%str], m8 ; t1:m9->r2 + mova [rsp+ 3*%%str], m2 ; t2:m2->r3 + mova [rsp+ 4*%%str], m3 ; t3:m3->r4 + mova [rsp+ 5*%%str], m13 ; t4:m13->r5 + mova m2, [%1+ 6*32] ; in6 + mova m3, [%1+ 9*32] ; in9 + mova m8, [%1+ 1*32] ; in1 + mova m9, [%1+14*32] ; in14 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] + VP9_UNPACK_MULSUB_2D_4X 8, 9, 13, 14, 2404, 16207 ; m8/13=t15[d], m9/14=t14[d] + VP9_RND_SH_SUMSUB_BA 9, 2, 14, 6, 15, [pd_8192] ; m9=t6[w], m2=t14[w] + VP9_RND_SH_SUMSUB_BA 8, 3, 13, 7, 15, [pd_8192] ; m8=t7[w], m3=t15[w] + + ; r0=t0, r2=t1, r3=t2, r4=t3, r5=t4, m12=t5, m9=t6, m8=t7 + ; m0=t8, m1=t9, m11=t10, m10=t11, m4=t12, m5=t13, m2=t14, m3=t15 + + ; handle t8-15 first + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] + VP9_UNPACK_MULSUB_2D_4X 5, 4, 13, 14, 3196, 16069 ; m5/13=t12[d], m4/14=t13[d] + VP9_RND_SH_SUMSUB_BA 5, 1, 13, 7, 15, [pd_8192] ; m5=t8[w], m1=t12[w] + VP9_RND_SH_SUMSUB_BA 4, 0, 14, 6, 15, [pd_8192] ; m4=t9[w], m0=t13[w] + + VP9_UNPACK_MULSUB_2D_4X 11, 10, 6, 7, 9102, 13623 ; m11/6=t11[d], m10/7=t10[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 13, 14, 13623, 9102 ; m3/13=t14[d], m2/14=t15[d] + VP9_RND_SH_SUMSUB_BA 3, 10, 13, 7, 15, [pd_8192] ; m3=t10[w], m10=t14[w] + VP9_RND_SH_SUMSUB_BA 2, 11, 14, 6, 15, [pd_8192] ; m2=t11[w], m11=t15[w] + + ; m5=t8, m4=t9, m3=t10, m2=t11, m1=t12, m0=t13, m10=t14, m11=t15 + + VP9_UNPACK_MULSUB_2D_4X 1, 0, 6, 7, 15137, 6270 ; m1/6=t13[d], m0/7=t12[d] + VP9_UNPACK_MULSUB_2D_4X 11, 10, 13, 14, 6270, 15137 ; m11/13=t14[d], m10/14=t15[d] + VP9_RND_SH_SUMSUB_BA 11, 0, 13, 7, 15, [pd_8192] ; m11=out2[w], m0=t14[w] + VP9_RND_SH_SUMSUB_BA 10, 1, 14, 6, 15, [pd_8192] + psignw m10, [pw_m1] ; m10=out13[w], m1=t15[w] + + SUMSUB_BA w, 3, 5, 15 + psignw m3, [pw_m1] ; m3=out1[w], m5=t10[w] + SUMSUB_BA w, 2, 4, 15 ; m2=out14[w], m4=t11[w] + + SUMSUB_BA w, 5, 4, 15 + pmulhrsw m5, [pw_11585x2] ; m5=out6[w] + pmulhrsw m4, [pw_11585x2] ; m4=out9[w] + SUMSUB_BA w, 1, 0, 15 + pmulhrsw m1, [pw_m11585x2] ; m1=out5[w] + pmulhrsw m0, [pw_11585x2] ; m0=out10[w] + + ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14 + + mova m6, [rsp+ 0*%%str] + mova m7, [rsp+ 2*%%str] + mova m13, [rsp+ 3*%%str] + mova m14, [rsp+ 4*%%str] + mova m15, [rsp+ 5*%%str] + mova [rsp+ 8*%%str], m5 + mova [rsp+ 9*%%str], m4 + mova [rsp+10*%%str], m0 + mova [rsp+11*%%str], m10 + mova [rsp+12*%%str], m2 + + ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7 + ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14 + + SUMSUB_BA w, 15, 6, 0 ; m15=t0[w], m6=t4[w] + SUMSUB_BA w, 12, 7, 0 ; m12=t1[w], m7=t5[w] + SUMSUB_BA w, 9, 13, 0 ; m9=t2[w], m13=t6[w] + SUMSUB_BA w, 8, 14, 0 ; m8=t3[w], m14=t7[w] + + VP9_UNPACK_MULSUB_2D_4X 6, 7, 0, 2, 15137, 6270 ; m6/0=t5[d], m7/2=t4[d] + VP9_UNPACK_MULSUB_2D_4X 14, 13, 4, 5, 6270, 15137 ; m14/4=t6[d], m13/5=t7[d] + VP9_RND_SH_SUMSUB_BA 14, 7, 4, 2, 10, [pd_8192] 
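    ; (psignw against pw_m1 multiplies each word lane by -1; the vp9 iadst16
    ; negates outputs 1, 3, 13 and 15, and the psignw below applies that sign
    ; to out3, just as the earlier ones did for out13, out1 and out15)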
+ psignw m14, [pw_m1] ; m14=out3[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 13, 6, 5, 0, 10, [pd_8192] ; m13=out12[w], m6=t7[w] + SUMSUB_BA w, 9, 15, 10 ; m9=out0[w], m15=t2[w] + SUMSUB_BA w, 8, 12, 10 + psignw m8, [pw_m1] ; m8=out15[w], m12=t3[w] + + SUMSUB_BA w, 12, 15, 10 + pmulhrsw m12, [pw_m11585x2] ; m12=out7[w] + pmulhrsw m15, [pw_11585x2] ; m15=out8[w] + SUMSUB_BA w, 7, 6, 10 + pmulhrsw m7, [pw_11585x2] ; m7=out4[w] + pmulhrsw m6, [pw_11585x2] ; m6=out11[w] + + ; m9=out0, m14=out3, m7=out4, m12=out7, m15=out8, m6=out11, m13=out12, m8=out15 + ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14 + +%if %2 == 1 + mova m0, [rsp+ 8*%%str] + TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2 + mova [rsp+ 0*16], m9 + mova [rsp+ 2*16], m3 + mova [rsp+ 4*16], m11 + mova [rsp+ 6*16], m14 + mova m9, [rsp+ 9*%%str] + mova m3, [rsp+10*%%str] + mova m11, [rsp+11*%%str] + mova m14, [rsp+12*%%str] + mova [rsp+ 8*16], m7 + mova [rsp+10*16], m1 + mova [rsp+12*16], m0 + mova [rsp+14*16], m12 + + TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2 + mova [rsp+ 1*16], m15 + mova [rsp+ 3*16], m9 + mova [rsp+ 5*16], m3 + mova [rsp+ 7*16], m6 + mova [rsp+ 9*16], m13 + mova [rsp+11*16], m11 + mova [rsp+13*16], m14 + mova [rsp+15*16], m8 +%else + mova m5, [rsp+ 8*%%str] + pxor m0, m0 + + pmulhrsw m9, [pw_512] + pmulhrsw m3, [pw_512] + VP9_STORE_2X 9, 3, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m11, [pw_512] + pmulhrsw m14, [pw_512] + VP9_STORE_2X 11, 14, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m7, [pw_512] + pmulhrsw m1, [pw_512] + VP9_STORE_2X 7, 1, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m5, [pw_512] + pmulhrsw m12, [pw_512] + VP9_STORE_2X 5, 12, 2, 4, 0 + lea dstq, [dstq+strideq*2] + + mova m9, [rsp+ 9*%%str] + mova m3, [rsp+10*%%str] + mova m11, [rsp+11*%%str] + mova m14, [rsp+12*%%str] + + pmulhrsw m15, [pw_512] + pmulhrsw m9, [pw_512] + VP9_STORE_2X 15, 9, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m3, [pw_512] + pmulhrsw m6, [pw_512] + VP9_STORE_2X 3, 6, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m13, [pw_512] + pmulhrsw m11, [pw_512] + VP9_STORE_2X 13, 11, 2, 4, 0 + lea dstq, [dstq+strideq*2] + pmulhrsw m14, [pw_512] + pmulhrsw m8, [pw_512] + VP9_STORE_2X 14, 8, 2, 4, 0 +%endif +%endmacro + +%macro IADST16_FN 5 +INIT_XMM %5 +cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob + ; potential eob checks go here + + DEFINE_ARGS dst, stride, block, cnt, dst_bak + mov cntd, 2 +.loop1_full: + VP9_%2_1D blockq, 1 + add blockq, 16 + add rsp, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + sub rsp, 512 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_full: + VP9_%4_1D rsp, 2 + lea dstq, [dst_bakq+8] + add rsp, 16 + dec cntd + jg .loop2_full + sub rsp, 32 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_FN idct, IDCT16, iadst, IADST16, ssse3 +IADST16_FN idct, IDCT16, iadst, IADST16, avx +IADST16_FN iadst, IADST16, idct, IDCT16, ssse3 +IADST16_FN iadst, IADST16, idct, IDCT16, avx +IADST16_FN iadst, IADST16, iadst, IADST16, ssse3 +IADST16_FN iadst, IADST16, iadst, IADST16, avx + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc +%assign %%str 16*%2*%2 + ; first do t0-15, 
this can be done identically to idct16x16 + VP9_IDCT16_1D_START %1, %3/2, 64*2, rsp+ 4*%%str + + ; backup a different register + mova [rsp+30*%%str], m15 ; t15 + mova m7, [rsp+ 4*%%str] + + SUMSUB_BA w, 6, 9, 15 ; t6, t9 + SUMSUB_BA w, 7, 8, 15 ; t7, t8 + + ; store everything on stack to make space available for t16-31 + ; we store interleaved with the output of the second half (t16-31) + ; so we don't need to allocate extra stack space + mova [rsp+ 0*%%str], m0 ; t0 + mova [rsp+ 4*%%str], m1 ; t1 + mova [rsp+ 8*%%str], m2 ; t2 + mova [rsp+12*%%str], m3 ; t3 + mova [rsp+16*%%str], m4 ; t4 + mova [rsp+20*%%str], m5 ; t5 + mova [rsp+24*%%str], m6 ; t6 + mova [rsp+28*%%str], m7 ; t7 + mova [rsp+ 2*%%str], m8 ; t8 + mova [rsp+ 6*%%str], m9 ; t9 + mova [rsp+10*%%str], m10 ; t10 + mova [rsp+14*%%str], m11 ; t11 + mova [rsp+18*%%str], m12 ; t12 + mova [rsp+22*%%str], m13 ; t13 + mova [rsp+26*%%str], m14 ; t14 + + ; then, secondly, do t16-31 +%if %3 <= 8 + mova m4, [%1+ 1*64] + mova m3, [%1+ 3*64] + mova m0, [%1+ 5*64] + mova m7, [%1+ 7*64] + + pmulhrsw m11, m4, [pw_16364x2] ;t31 + pmulhrsw m4, [pw_804x2] ;t16 + pmulhrsw m8, m7, [pw_m5520x2] ;t19 + pmulhrsw m7, [pw_15426x2] ;t28 + pmulhrsw m15, m0, [pw_15893x2] ;t27 + pmulhrsw m0, [pw_3981x2] ;t20 + pmulhrsw m12, m3, [pw_m2404x2] ;t23 + pmulhrsw m3, [pw_16207x2] ;t24 + + ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23, + ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31 + + VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30 + VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29 + ; from 1 stage forward + SUMSUB_BA w, 8, 4, 1 + ; temporary storage + mova [rsp+17*%%str], m8 ; t16 + mova [rsp+21*%%str], m4 ; t19 + VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 + VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 +%else + mova m10, [%1+ 1*64] + mova m13, [%1+ 3*64] + mova m14, [%1+ 5*64] + mova m9, [%1+ 7*64] + mova m8, [%1+ 9*64] + mova m15, [%1+11*64] + mova m12, [%1+13*64] + mova m11, [%1+15*64] +%if %3 <= 16 + pmulhrsw m5, m10, [pw_16364x2] + pmulhrsw m10, [pw_804x2] + pmulhrsw m4, m11, [pw_m11003x2] + pmulhrsw m11, [pw_12140x2] + pmulhrsw m7, m8, [pw_14811x2] + pmulhrsw m8, [pw_7005x2] + pmulhrsw m6, m9, [pw_m5520x2] + pmulhrsw m9, [pw_15426x2] + pmulhrsw m1, m14, [pw_15893x2] + pmulhrsw m14, [pw_3981x2] + pmulhrsw m0, m15, [pw_m8423x2] + pmulhrsw m15, [pw_14053x2] +%else + mova m4, [%1+17*64] + mova m0, [%1+21*64] + mova m7, [%1+23*64] + mova m6, [%1+25*64] + mova m1, [%1+27*64] + mova m5, [%1+31*64] + + ; m10=in1, m4=in17, m8=in9, m6=in25, m14=in5, m0=in21, m12=in13, m2=in29, + ; m13=in3, m3=in19, m15=in11, m1=in27, m9=in7, m7=in23, m11=in15, m5=in31 + + VP9_UNPACK_MULSUB_2W_4X 10, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 + VP9_UNPACK_MULSUB_2W_4X 4, 11, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 + VP9_UNPACK_MULSUB_2W_4X 8, 7, 14811, 7005, [pd_8192], 2, 3 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 6, 9, 5520, 15426, [pd_8192], 2, 3 ; t19, t28 + VP9_UNPACK_MULSUB_2W_4X 14, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 0, 15, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 +%endif + + ; from 1 stage forward + SUMSUB_BA w, 4, 10, 2 + SUMSUB_BA w, 8, 6, 2 + ; from 2 stages forward + SUMSUB_BA w, 8, 4, 2 + ; temporary storage + mova [rsp+17*%%str], m8 ; t16 + mova [rsp+21*%%str], m4 ; t19 +%if %3
<= 16 + pmulhrsw m3, m12, [pw_13160x2] + pmulhrsw m12, [pw_9760x2] + pmulhrsw m2, m13, [pw_m2404x2] + pmulhrsw m13, [pw_16207x2] +%else + mova m2, [%1+29*64] + mova m3, [%1+19*64] + VP9_UNPACK_MULSUB_2W_4X 12, 3, 13160, 9760, [pd_8192], 4, 8 ; t22, t25 + VP9_UNPACK_MULSUB_2W_4X 2, 13, 2404, 16207, [pd_8192], 4, 8 ; t23, t24 +%endif + + ; m10=t16, m4=t17, m8=t18, m6=t19, m14=t20, m0=t21, m12=t22, m2=t23, + ; m13=t24, m3=t25, m15=t26, m1=t27, m9=t28, m7=t29, m11=t30, m5=t31 + + SUMSUB_BA w, 0, 14, 4 + SUMSUB_BA w, 12, 2, 4 + SUMSUB_BA w, 3, 13, 4 + SUMSUB_BA w, 15, 1, 4 + SUMSUB_BA w, 7, 9, 4 + SUMSUB_BA w, 11, 5, 4 + + ; m4=t16, m10=t17, m6=t18, m8=t19, m0=t20, m14=t21, m2=t22, m12=t23, + ; m3=t24, m13=t25, m1=t26, m15=t27, m7=t28, m9=t29, m5=t30, m11=t31 + + VP9_UNPACK_MULSUB_2W_4X 5, 10, 16069, 3196, [pd_8192], 4, 8 ; t17, t30 + VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 + VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 +%endif + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 + + SUMSUB_BA w, 9, 5, 4 + SUMSUB_BA w, 1, 13, 4 + SUMSUB_BA w, 0, 12, 4 + SUMSUB_BA w, 15, 3, 4 + SUMSUB_BA w, 14, 2, 4 + SUMSUB_BA w, 6, 10, 4 + SUMSUB_BA w, 7, 11, 4 + + ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, + ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 + + mova m8, [rsp+17*%%str] ; t16 + ; from 2 stages forward + SUMSUB_BA w, 0, 8, 4 + SUMSUB_BA w, 15, 7, 4 + ; from 3 stages forward + SUMSUB_BA w, 8, 7, 4 + pmulhrsw m7, [pw_11585x2] + pmulhrsw m8, [pw_11585x2] + ; store t16/t23 + mova [rsp+ 1*%%str], m0 ; t16 + mova [rsp+29*%%str], m7 ; t23 + + mova m4, [rsp+21*%%str] ; t19 + VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28 + VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 2, 13, 6270, m15137, [pd_8192], 0, 7 ; t21, t26 + + ; m8=t16, m9=t17, m10=t18, m11=t19, m3=t20, m2=t21, m1=t22, m0=t23, + ; m15=t24, m14=t25, m13=t26, m12=t27, m4=t28, m5=t29, m6=t30, m7=t31 + + SUMSUB_BA w, 1, 9, 0 + SUMSUB_BA w, 2, 10, 0 + SUMSUB_BA w, 3, 11, 0 + SUMSUB_BA w, 12, 4, 0 + SUMSUB_BA w, 13, 5, 0 + SUMSUB_BA w, 14, 6, 0 + + ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, + ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 + + SUMSUB_BA w, 9, 6, 0 + SUMSUB_BA w, 10, 5, 0 + SUMSUB_BA w, 11, 4, 0 + + pmulhrsw m6, [pw_11585x2] + pmulhrsw m9, [pw_11585x2] + pmulhrsw m5, [pw_11585x2] + pmulhrsw m10, [pw_11585x2] + pmulhrsw m4, [pw_11585x2] + pmulhrsw m11, [pw_11585x2] + + ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23, + ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 + + ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for + ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for + ; final sumsub of pass 2 + mova [rsp+ 5*%%str], m1 ; t17 + mova [rsp+ 9*%%str], m2 ; t18 + mova [rsp+13*%%str], m3 ; t19 + + ; then do final pass to sumsub+store the two halves +%if %2 == 1 + mova [rsp+17*%%str], m4 ; t20 + mova [rsp+21*%%str], m5 ; t21 + mova [rsp+25*%%str], m6 ; t22 + + mova m0, [rsp+ 0*%%str] ; t0 + mova m1, [rsp+ 4*%%str] ; t1 + mova m2, [rsp+ 8*%%str] ; t2 + mova m3, [rsp+12*%%str] ; t3 + mova m4, [rsp+16*%%str] 
; t4 + mova m5, [rsp+20*%%str] ; t5 + mova m6, [rsp+24*%%str] ; t6 + + SUMSUB_BA w, 15, 0, 7 + mova [rsp+ 3*%%str], m0 ; t15 + mova m7, [rsp+28*%%str] ; t7 + SUMSUB_BA w, 14, 1, 0 + SUMSUB_BA w, 13, 2, 0 + SUMSUB_BA w, 12, 3, 0 + SUMSUB_BA w, 11, 4, 0 + SUMSUB_BA w, 10, 5, 0 + SUMSUB_BA w, 9, 6, 0 + SUMSUB_BA w, 8, 7, 0 + + TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0 + mova [rsp+ 0*%%str], m15 + mova [rsp+ 4*%%str], m14 + mova [rsp+ 8*%%str], m13 + mova [rsp+12*%%str], m12 + mova [rsp+16*%%str], m11 + mova [rsp+20*%%str], m10 + mova [rsp+24*%%str], m9 + mova [rsp+28*%%str], m8 + + mova m0, [rsp+ 3*%%str] ; t15 + TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8 + mova [rsp+ 3*%%str], m7 + mova [rsp+ 7*%%str], m6 + mova [rsp+11*%%str], m5 + mova [rsp+15*%%str], m4 + mova [rsp+19*%%str], m3 + mova [rsp+23*%%str], m2 + mova [rsp+27*%%str], m1 + mova [rsp+31*%%str], m0 + + mova m15, [rsp+ 2*%%str] ; t8 + mova m14, [rsp+ 6*%%str] ; t9 + mova m13, [rsp+10*%%str] ; t10 + mova m12, [rsp+14*%%str] ; t11 + mova m11, [rsp+18*%%str] ; t12 + mova m10, [rsp+22*%%str] ; t13 + mova m9, [rsp+26*%%str] ; t14 + mova m8, [rsp+30*%%str] ; t15 + mova m7, [rsp+ 1*%%str] ; t16 + mova m6, [rsp+ 5*%%str] ; t17 + mova m5, [rsp+ 9*%%str] ; t18 + mova m4, [rsp+13*%%str] ; t19 + mova m3, [rsp+17*%%str] ; t20 + mova m2, [rsp+21*%%str] ; t21 + mova m1, [rsp+25*%%str] ; t22 + + SUMSUB_BA w, 7, 8, 0 + mova [rsp+ 2*%%str], m8 + mova m0, [rsp+29*%%str] ; t23 + SUMSUB_BA w, 6, 9, 8 + SUMSUB_BA w, 5, 10, 8 + SUMSUB_BA w, 4, 11, 8 + SUMSUB_BA w, 3, 12, 8 + SUMSUB_BA w, 2, 13, 8 + SUMSUB_BA w, 1, 14, 8 + SUMSUB_BA w, 0, 15, 8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + mova [rsp+ 1*%%str], m0 + mova [rsp+ 5*%%str], m1 + mova [rsp+ 9*%%str], m2 + mova [rsp+13*%%str], m3 + mova [rsp+17*%%str], m4 + mova [rsp+21*%%str], m5 + mova [rsp+25*%%str], m6 + mova [rsp+29*%%str], m7 + + mova m8, [rsp+ 2*%%str] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [rsp+ 2*%%str], m8 + mova [rsp+ 6*%%str], m9 + mova [rsp+10*%%str], m10 + mova [rsp+14*%%str], m11 + mova [rsp+18*%%str], m12 + mova [rsp+22*%%str], m13 + mova [rsp+26*%%str], m14 + mova [rsp+30*%%str], m15 +%else + ; t0-7 is in [rsp+{0,4,8,12,16,20,24,28}*%%str] + ; t8-15 is in [rsp+{2,6,10,14,18,22,26,30}*%%str] + ; t16-19 and t23 is in [rsp+{1,5,9,13,29}*%%str] + ; t20-22 is in m4-6 + ; t24-31 is in m8-15 + pxor m7, m7 + +%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs + SUMSUB_BA w, %4, %1, %5 + SUMSUB_BA w, %3, %2, %5 + pmulhrsw m%4, [pw_512] + pmulhrsw m%3, [pw_512] + VP9_STORE_2X %4, %3, %5, %6, %7 +%if %8 == 1 + add dstq, stride2q +%endif + pmulhrsw m%2, [pw_512] + pmulhrsw m%1, [pw_512] + VP9_STORE_2X %2, %1, %5, %6, %7, dst_endq +%if %8 == 1 + sub dst_endq, stride2q +%endif +%endmacro + + ; store t0-1 and t30-31 + mova m0, [rsp+ 0*%%str] + mova m1, [rsp+ 4*%%str] + %%STORE_2X2 0, 1, 14, 15, 2, 3, 7 + + ; store t2-3 and t28-29 + mova m0, [rsp+ 8*%%str] + mova m1, [rsp+12*%%str] + %%STORE_2X2 0, 1, 12, 13, 2, 3, 7 + + ; store t4-5 and t26-27 + mova m0, [rsp+16*%%str] + mova m1, [rsp+20*%%str] + %%STORE_2X2 0, 1, 10, 11, 2, 3, 7 + + ; store t6-7 and t24-25 + mova m0, [rsp+24*%%str] + mova m1, [rsp+28*%%str] + %%STORE_2X2 0, 1, 8, 9, 2, 3, 7 + + ; store t8-9 and t22-23 + mova m0, [rsp+ 2*%%str] + mova m1, [rsp+ 6*%%str] + mova m8, [rsp+29*%%str] + %%STORE_2X2 0, 1, 6, 8, 2, 3, 7 + + ; store t10-11 and t20-21 + mova m0, [rsp+10*%%str] + mova m1, [rsp+14*%%str] + %%STORE_2X2 0, 1, 4, 5, 2, 3, 7 + + ; store t12-13 and t18-19 + mova m0, [rsp+18*%%str] + mova m1, 
[rsp+22*%%str] + mova m5, [rsp+13*%%str] + mova m4, [rsp+ 9*%%str] + %%STORE_2X2 0, 1, 4, 5, 2, 3, 7 + + ; store t14-17 + mova m0, [rsp+26*%%str] + mova m1, [rsp+30*%%str] + mova m5, [rsp+ 5*%%str] + mova m4, [rsp+ 1*%%str] + %%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0 +%endif +%endmacro + +%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob + cmp eobd, 135 + jg .idctfull + cmp eobd, 34 + jg .idct16x16 + cmp eobd, 1 + jg .idct8x8 + + ; dc-only case + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + SPLATW m0, m0, q0000 + pmulhrsw m0, [pw_512] + pxor m5, m5 + movd [blockq], m5 + DEFINE_ARGS dst, stride, block, cnt +%rep 31 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + add dstq, strideq +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + RET + + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2 +.idct8x8: + VP9_IDCT32_1D blockq, 1, 8 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_8x8: + mov dstq, dst_bakq + lea dst_endq, [dst_bakq+stride30q] + VP9_IDCT32_1D rsp, 2, 8 + add dst_bakq, 8 + add rsp, 16 + dec cntd + jg .loop2_8x8 + sub rsp, 64 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 8, m7 + RET + +.idct16x16: + mov cntd, 2 +.loop1_16x16: + VP9_IDCT32_1D blockq, 1, 16 + add blockq, 16 + add rsp, 512 + dec cntd + jg .loop1_16x16 + sub blockq, 32 + sub rsp, 1024 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_16x16: + mov dstq, dst_bakq + lea dst_endq, [dst_bakq+stride30q] + VP9_IDCT32_1D rsp, 2, 16 + add dst_bakq, 8 + add rsp, 16 + dec cntd + jg .loop2_16x16 + sub rsp, 64 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 16, m7 + RET + +.idctfull: + mov cntd, 4 +.loop1_full: + VP9_IDCT32_1D blockq, 1 + add blockq, 16 + add rsp, 512 + dec cntd + jg .loop1_full + sub blockq, 64 + sub rsp, 2048 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_full: + mov dstq, dst_bakq + lea dst_endq, [dst_bakq+stride30q] + VP9_IDCT32_1D rsp, 2 + add dst_bakq, 8 + add rsp, 16 + dec cntd + jg .loop2_full + sub rsp, 64 + + ; at the end of the loop, m7 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 32, m7 + RET +%endmacro + +VP9_IDCT_IDCT_32x32_ADD_XMM ssse3 +VP9_IDCT_IDCT_32x32_ADD_XMM avx + +%endif ; x86-64 diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm new file mode 100644 index 0000000000..75ce849e59 --- /dev/null +++ b/libavcodec/x86/vp9lpf.asm @@ -0,0 +1,673 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%if ARCH_X86_64 + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_3 +cextern pb_80 + +pb_4: times 16 db 0x04 +pb_10: times 16 db 0x10 +pb_40: times 16 db 0x40 +pb_81: times 16 db 0x81 +pb_f8: times 16 db 0xf8 +pb_fe: times 16 db 0xfe + +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 + +SECTION .text + +; %1 = abs(%2-%3) +%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp + psubusb %1, %3, %2 + psubusb %4, %2, %3 + por %1, %4 +%endmacro + +; %1 = %1<=%2 +%macro CMP_LTE 4 ; src/dst, cmp, tmp, pb_80 + pxor %1, %4 + pcmpgtb %3, %2, %1 ; cmp > src? + pcmpeqb %1, %2 ; cmp == src? XXX: avoid this with a -1/+1 well placed? + por %1, %3 ; cmp >= src? +%endmacro + +; %1 = abs(%2-%3) <= %4 +%macro ABSSUB_CMP 6-7 [pb_80]; dst, src1, src2, cmp, tmp1, tmp2, [pb_80] + ABSSUB %1, %2, %3, %6 ; dst = abs(src1-src2) + CMP_LTE %1, %4, %6, %7 ; dst <= cmp +%endmacro + +%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp + pand %1, %3 ; new &= mask + pandn %4, %3, %2 ; tmp = ~mask & old + por %1, %4 ; new&mask | old&~mask +%endmacro + +%macro FILTER_SUBx2_ADDx2 8 ; %1=dst %2=h/l %3=cache %4=sub1 %5=sub2 %6=add1 %7=add2 %8=rshift + punpck%2bw %3, %4, m0 + psubw %1, %3 + punpck%2bw %3, %5, m0 + psubw %1, %3 + punpck%2bw %3, %6, m0 + paddw %1, %3 + punpck%2bw %3, %7, m0 + paddw %1, %3 + mova %3, %1 + psraw %1, %8 +%endmacro + +%macro FILTER_INIT 7-8 ; tmp1, tmp2, cacheL, cacheH, dstp, filterid, [source] + FILTER%6_INIT %1, l, %3 + FILTER%6_INIT %2, h, %4 + packuswb %1, %2 +%if %0 == 8 + MASK_APPLY %1, %8, %7, %2 +%else + MASK_APPLY %1, %5, %7, %2 +%endif + mova %5, %1 +%endmacro + +%macro FILTER_UPDATE 11-12 ; tmp1, tmp2, cacheL, cacheH, dstp, -, -, +, +, rshift, [source] + FILTER_SUBx2_ADDx2 %1, l, %3, %6, %7, %8, %9, %10 + FILTER_SUBx2_ADDx2 %2, h, %4, %6, %7, %8, %9, %10 + packuswb %1, %2 +%if %0 == 12 + MASK_APPLY %1, %12, %11, %2 +%else + MASK_APPLY %1, %5, %11, %2 +%endif + mova %5, %1 +%endmacro + +%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp + mova %4, [pb_f8] + pand %1, %4 + pand %2, %4 + psrlq %1, 3 + psrlq %2, 3 + pxor %1, %3 + pxor %2, %3 + psubb %1, %3 + psubb %2, %3 +%endmacro + +%macro EXTRACT_POS_NEG 3 ; i8, neg, pos + pxor %3, %3 + pxor %2, %2 + pcmpgtb %3, %1 ; i8 < 0 mask + psubb %2, %1 ; neg values (only the originally - will be kept) + pand %2, %3 ; negative values of i8 (but stored as +) + pandn %3, %1 ; positive values of i8 +%endmacro + +; clip_u8(u8 + i8) +%macro SIGN_ADD 5 ; dst, u8, i8, tmp1, tmp2 + EXTRACT_POS_NEG %3, %4, %5 + psubusb %1, %2, %4 ; sub the negatives + paddusb %1, %5 ; add the positives +%endmacro + +; clip_u8(u8 - i8) +%macro SIGN_SUB 5 ; dst, u8, i8, tmp1, tmp2 + EXTRACT_POS_NEG %3, %4, %5 + psubusb %1, %2, %5 ; sub the positives + paddusb %1, %4 ; add the negatives +%endmacro + +%macro FILTER6_INIT 3 ; %1=dst %2=h/l %3=cache + punpck%2bw %3, m14, m0 ; p3: B->W + mova %1, %3 ; p3 + paddw %1, %3 ; p3*2 + paddw %1, %3 ; p3*3 + punpck%2bw %3, m15, m0 ; p2: B->W + paddw %1, %3 ; p3*3 + p2 + paddw 
%1, %3 ; p3*3 + p2*2 + punpck%2bw %3, m10, m0 ; p1: B->W + paddw %1, %3 ; p3*3 + p2*2 + p1 + punpck%2bw %3, m11, m0 ; p0: B->W + paddw %1, %3 ; p3*3 + p2*2 + p1 + p0 + punpck%2bw %3, m12, m0 ; q0: B->W + paddw %1, %3 ; p3*3 + p2*2 + p1 + p0 + q0 + paddw %1, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4 + mova %3, %1 ; base for next line (cache) + psraw %1, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3 +%endmacro + +%macro FILTER14_INIT 3 ; %1=dst %2=h/l %3=cache + punpck%2bw %1, m2, m0 ; p7: B->W + mova %3, %1 + psllw %1, 3 ; p7*8 + psubw %1, %3 ; p7*7 + punpck%2bw %3, m3, m0 ; p6: B->W + paddw %1, %3 ; p7*7 + p6 + paddw %1, %3 ; p7*7 + p6*2 + punpck%2bw %3, m8, m0 ; p5: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + punpck%2bw %3, m9, m0 ; p4: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + p4 + punpck%2bw %3, m14, m0 ; p3: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + p4 + p3 + punpck%2bw %3, m15, m0 ; p2: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p2 + punpck%2bw %3, m10, m0 ; p1: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p1 + punpck%2bw %3, m11, m0 ; p0: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0 + punpck%2bw %3, m12, m0 ; q0: B->W + paddw %1, %3 ; p7*7 + p6*2 + p5 + .. + p0 + q0 + paddw %1, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8 + mova %3, %1 ; base for next line (cache) + psraw %1, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4 +%endmacro + +%macro TRANSPOSE16x16B 17 + mova %17, m%16 + SBUTTERFLY bw, %1, %2, %16 + SBUTTERFLY bw, %3, %4, %16 + SBUTTERFLY bw, %5, %6, %16 + SBUTTERFLY bw, %7, %8, %16 + SBUTTERFLY bw, %9, %10, %16 + SBUTTERFLY bw, %11, %12, %16 + SBUTTERFLY bw, %13, %14, %16 + mova m%16, %17 + mova %17, m%14 + SBUTTERFLY bw, %15, %16, %14 + SBUTTERFLY wd, %1, %3, %14 + SBUTTERFLY wd, %2, %4, %14 + SBUTTERFLY wd, %5, %7, %14 + SBUTTERFLY wd, %6, %8, %14 + SBUTTERFLY wd, %9, %11, %14 + SBUTTERFLY wd, %10, %12, %14 + SBUTTERFLY wd, %13, %15, %14 + mova m%14, %17 + mova %17, m%12 + SBUTTERFLY wd, %14, %16, %12 + SBUTTERFLY dq, %1, %5, %12 + SBUTTERFLY dq, %2, %6, %12 + SBUTTERFLY dq, %3, %7, %12 + SBUTTERFLY dq, %4, %8, %12 + SBUTTERFLY dq, %9, %13, %12 + SBUTTERFLY dq, %10, %14, %12 + SBUTTERFLY dq, %11, %15, %12 + mova m%12, %17 + mova %17, m%8 + SBUTTERFLY dq, %12, %16, %8 + SBUTTERFLY qdq, %1, %9, %8 + SBUTTERFLY qdq, %2, %10, %8 + SBUTTERFLY qdq, %3, %11, %8 + SBUTTERFLY qdq, %4, %12, %8 + SBUTTERFLY qdq, %5, %13, %8 + SBUTTERFLY qdq, %6, %14, %8 + SBUTTERFLY qdq, %7, %15, %8 + mova m%8, %17 + mova %17, m%1 + SBUTTERFLY qdq, %8, %16, %1 + mova m%1, %17 + SWAP %2, %9 + SWAP %3, %5 + SWAP %4, %13 + SWAP %6, %11 + SWAP %8, %15 + SWAP %12, %14 +%endmacro + +%macro LPF_16_16 1 + lea mstrideq, [strideq] + neg mstrideq + +%ifidn %1, h + lea dstq, [dstq + 8*strideq - 8] ; go from top center (h pos) to center left (v pos) + lea dst1q, [dstq + 8*mstrideq] ; dst1 = &dst[stride * -8] + lea dst2q, [dst1q + 1* strideq] ; dst2 = &dst[stride * -7] + movu m0, [dst1q ] ; m0 = dst[stride * -8] (p7) + movu m1, [dst2q ] ; m1 = dst[stride * -7] (p6) + movu m2, [dst1q + 2* strideq] ; m2 = dst[stride * -6] (p5) + movu m3, [dst2q + 2* strideq] ; m3 = dst[stride * -5] (p4) + lea dst1q, [dstq] ; dst1 = &dst[stride * +0] + lea dst2q, [dstq + 1*strideq] ; dst2 = &dst[stride * +1] + movu m4, [dst1q + 4*mstrideq] ; m4 = dst[stride * -4] (p3) + movu m5, [dst2q + 4*mstrideq] ; m5 = dst[stride * -3] (p2) + movu m6, [dst1q + 2*mstrideq] ; m6 = dst[stride * -2] (p1) + movu m7, [dst2q + 2*mstrideq] ; m7 = dst[stride * -1] (p0) + movu m8, [dst1q] ; m8 = dst[stride * +0] (q0) + movu m9, [dst2q] ; m9 = dst[stride * 
+1] (q1) + movu m10, [dst1q + 2* strideq] ; m10 = dst[stride * +2] (q2) + movu m11, [dst2q + 2* strideq] ; m11 = dst[stride * +3] (q3) + movu m12, [dst1q + 4* strideq] ; m12 = dst[stride * +4] (q4) + movu m13, [dst2q + 4* strideq] ; m13 = dst[stride * +5] (q5) + lea dst1q, [dstq + 8* strideq] ; dst1 = &dst[stride * +8] + movu m14, [dst1q + 2*mstrideq] ; m14 = dst[stride * +6] (q6) + movu m15, [dst1q + 1*mstrideq] ; m15 = dst[stride * +7] (q7) + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + mova [rsp + 0], m0 ; dst[stride * -8] (p7) + mova [rsp + 16], m1 ; dst[stride * -7] (p6) + mova [rsp + 32], m2 ; dst[stride * -6] (p5) + mova [rsp + 48], m3 ; dst[stride * -5] (p4) + mova [rsp + 64], m4 ; dst[stride * -4] (p3) + mova [rsp + 80], m5 ; dst[stride * -3] (p2) + mova [rsp + 96], m6 ; dst[stride * -2] (p1) + mova [rsp + 112], m7 ; dst[stride * -1] (p0) + mova [rsp + 128], m8 ; dst[stride * +0] (q0) + mova [rsp + 144], m9 ; dst[stride * +1] (q1) + mova [rsp + 160], m10 ; dst[stride * +2] (q2) + mova [rsp + 176], m11 ; dst[stride * +3] (q3) + mova [rsp + 192], m12 ; dst[stride * +4] (q4) + mova [rsp + 208], m13 ; dst[stride * +5] (q5) + mova [rsp + 224], m14 ; dst[stride * +6] (q6) + mova [rsp + 240], m15 ; dst[stride * +7] (q7) +%endif + + ; calc fm mask +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m2, I, m0 ; I I I I ... + SPLATB_REG m3, E, m0 ; E E E E ... + mova m0, [pb_80] + pxor m2, m0 + pxor m3, m0 +%ifidn %1, v + lea dst1q, [dstq + 2*mstrideq] ; dst1 = &dst[stride * -2] + lea dst2q, [dstq + 2* strideq] ; dst2 = &dst[stride * +2] + mova m8, [dstq + 4*mstrideq] ; m8 = dst[stride * -4] (p3) + mova m9, [dst1q + 1*mstrideq] ; m9 = dst[stride * -3] (p2) + mova m10, [dstq + 2*mstrideq] ; m10 = dst[stride * -2] (p1) + mova m11, [dstq + 1*mstrideq] ; m11 = dst[stride * -1] (p0) + mova m12, [dstq ] ; m12 = dst[stride * +0] (q0) + mova m13, [dstq + 1* strideq] ; m13 = dst[stride * +1] (q1) + mova m14, [dstq + 2* strideq] ; m14 = dst[stride * +2] (q2) + mova m15, [dst2q + 1* strideq] ; m15 = dst[stride * +3] (q3) +%else + SWAP 8, 4, 12 + SWAP 9, 5, 13 + SWAP 10, 6, 14 + SWAP 11, 7, 15 +%endif + ABSSUB_CMP m5, m8, m9, m2, m6, m7, m0 ; m5 = abs(p3-p2) <= I + ABSSUB_CMP m1, m9, m10, m2, m6, m7, m0 ; m1 = abs(p2-p1) <= I + pand m5, m1 + ABSSUB_CMP m1, m10, m11, m2, m6, m7, m0 ; m1 = abs(p1-p0) <= I + pand m5, m1 + ABSSUB_CMP m1, m12, m13, m2, m6, m7, m0 ; m1 = abs(q1-q0) <= I + pand m5, m1 + ABSSUB_CMP m1, m13, m14, m2, m6, m7, m0 ; m1 = abs(q2-q1) <= I + pand m5, m1 + ABSSUB_CMP m1, m14, m15, m2, m6, m7, m0 ; m1 = abs(q3-q2) <= I + pand m5, m1 + ABSSUB m1, m11, m12, m7 ; abs(p0-q0) + paddusb m1, m1 ; abs(p0-q0) * 2 + ABSSUB m2, m10, m13, m7 ; abs(p1-q1) + pand m2, [pb_fe] ; drop lsb so shift can work + psrlq m2, 1 ; abs(p1-q1)/2 + paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2 + pxor m1, m0 + pcmpgtb m4, m3, m1 ; E > X? + pcmpeqb m3, m1 ; E == X? + por m3, m4 ; E >= X? + pand m3, m5 ; fm final value + + ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3) + ; calc flat8in and hev masks + mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80 + ABSSUB_CMP m2, m8, m11, m6, m4, m5 ; abs(p3 - p0) <= 1 + mova m8, [pb_80] + ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1 + pand m2, m1 + ABSSUB m4, m10, m11, m5 ; abs(p1 - p0) +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... 
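Down to the hev test just below, this is the byte-domain build-up of the spec's filter masks. SSE2 has no unsigned byte compare, so operands are XORed with pb_80 into signed range before pcmpgtb, CMP_LTE synthesizes <= out of > plus ==, and the pb_fe/psrlq pair halves abs(p1-q1) without a byte-shift instruction (the low bit of each byte is masked off so the 64-bit shift cannot leak across lanes). In scalar form, per pixel column, the masks being built are roughly (a sketch; function names are illustrative, E/I/H are the function arguments):

    #include <stdlib.h>

    /* "fm": is the edge filterable at all? The asm evaluates 16 such
       columns per compare instruction. */
    static int filter_mask(int E, int I, int p3, int p2, int p1, int p0,
                           int q0, int q1, int q2, int q3)
    {
        return abs(p3 - p2) <= I && abs(p2 - p1) <= I &&
               abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
               abs(q2 - q1) <= I && abs(q3 - q2) <= I &&
               abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= E;
    }

    /* "hev": high edge variance, routes to the short 2-tap filter. */
    static int hev_mask(int H, int p1, int p0, int q0, int q1)
    {
        return abs(p1 - p0) > H || abs(q1 - q0) > H;
    }

    /* "flat8in": every inner sample within 1 of the edge sample,
       making the flat 6-tap filter applicable. */
    static int flat8_mask(int p3, int p2, int p1, int p0,
                          int q0, int q1, int q2, int q3)
    {
        return abs(p3 - p0) <= 1 && abs(p2 - p0) <= 1 &&
               abs(p1 - p0) <= 1 && abs(q1 - q0) <= 1 &&
               abs(q2 - q0) <= 1 && abs(q3 - q0) <= 1;
    }

flat8out is the same <= 1 test extended outward to p7..p4 and q4..q7, which is what the loads around the "calc flat8out mask" comment below gather.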
+ pxor m7, m8 + pxor m4, m8 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + pxor m4, m8 + mova m1, m4 + CMP_LTE m1, m6, m5, m8 ; abs(p1 - p0) <= 1 + pand m2, m1 ; (flat8in) + ABSSUB m4, m13, m12, m1 ; abs(q1 - q0) + pxor m4, m8 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + pxor m4, m8 + por m0, m5 ; hev final value + mova m1, m4 + CMP_LTE m1, m6, m5, m8 ; abs(q1 - q0) <= 1 + pand m2, m1 ; (flat8in) + ABSSUB_CMP m1, m14, m12, m6, m4, m5, m8 ; abs(q2 - q0) <= 1 + pand m2, m1 + ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1 + pand m2, m1 ; flat8in final value + + ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3) + ; calc flat8out mask +%ifidn %1, v + lea dst2q, [dstq + 8*mstrideq] ; dst2 = &dst[stride * -8] (p7) + lea dst1q, [dst2q + 1*strideq] ; dst1 = &dst[stride * -7] (p6) + mova m8, [dst2q] ; m8 = p7 + mova m9, [dst1q] ; m9 = p6 +%else + mova m8, [rsp + 0] ; m8 = p7 + mova m9, [rsp + 16] ; m9 = p6 +%endif + ABSSUB_CMP m1, m8, m11, m6, m4, m5 ; abs(p7 - p0) <= 1 + ABSSUB_CMP m7, m9, m11, m6, m4, m5 ; abs(p6 - p0) <= 1 + pand m1, m7 +%ifidn %1, v + mova m8, [dst1q + 1*strideq] ; m8 = dst[stride * -6] (p5) + mova m9, [dst1q + 2*strideq] ; m9 = dst[stride * -5] (p4) +%else + mova m8, [rsp + 32] ; m8 = p5 + mova m9, [rsp + 48] ; m9 = p4 +%endif + ABSSUB_CMP m7, m8, m11, m6, m4, m5 ; abs(p5 - p0) <= 1 + pand m1, m7 + ABSSUB_CMP m7, m9, m11, m6, m4, m5 ; abs(p4 - p0) <= 1 + pand m1, m7 +%ifidn %1, v + lea dst2q, [dstq + 4*strideq] ; dst2 = &dst[stride * +4] (q4) + lea dst1q, [dst2q + 1*strideq] ; dst1 = &dst[stride * +5] (q5) + mova m14, [dst2q] ; m14 = q4 + mova m15, [dst1q] ; m15 = q5 +%else + mova m14, [rsp + 192] ; m14 = q4 + mova m15, [rsp + 208] ; m15 = q5 +%endif + ABSSUB_CMP m7, m14, m12, m6, m4, m5 ; abs(q4 - q0) <= 1 + pand m1, m7 + ABSSUB_CMP m7, m15, m12, m6, m4, m5 ; abs(q5 - q0) <= 1 + pand m1, m7 +%ifidn %1, v + mova m14, [dst1q + 1*strideq] ; m14 = dst[stride * +6] (q6) + mova m15, [dst1q + 2*strideq] ; m15 = dst[stride * +7] (q7) +%else + mova m14, [rsp + 224] ; m14 = q6 + mova m15, [rsp + 240] ; m15 = q7 +%endif + ABSSUB_CMP m7, m14, m12, m6, m4, m5 ; abs(q6 - q0) <= 1 + pand m1, m7 + ABSSUB_CMP m7, m15, m12, m6, m4, m5 ; abs(q7 - q0) <= 1 + pand m1, m7 ; flat8out final value + + ; if (fm) { + ; if (out && in) filter_14() + ; else if (in) filter_6() + ; else if (hev) filter_2() + ; else filter_4() + ; } + ; + ; f14: fm & out & in + ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in + ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev + ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev + + ; (m0: hev, m1: flat8out, m2: flat8in, m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7) + ; filter2() + mova m6, [pb_80] + pxor m15, m12, m6 ; q0 ^ 0x80 + pxor m14, m11, m6 ; p0 ^ 0x80 + psubsb m15, m14 ; (signed) q0 - p0 + pxor m4, m10, m6 ; p1 ^ 0x80 + pxor m5, m13, m6 ; q1 ^ 0x80 + psubsb m4, m5 ; (signed) p1 - q1 + paddsb m4, m15 ; (q0 - p0) + (p1 - q1) + paddsb m4, m15 ; 2*(q0 - p0) + (p1 - q1) + paddsb m4, m15 ; 3*(q0 - p0) + (p1 - q1) + paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127) + mova m14, [pb_10] ; will be reused in filter4() + SRSHIFT3B_2X m6, m4, m14, m7 ; f1 and f2 sign byte shift by 3 + SIGN_SUB m7, m12, m6, m5, m9 ; m7 = q0 - f1 + SIGN_ADD m8, m11, m4, m5, m9 ; m8 = p0 + f2 + pandn m6, m2, m3 ; ~mask(in) & mask(fm) + pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev) +
MASK_APPLY m7, m12, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4() + MASK_APPLY m8, m11, m6, m5 ; m8 = filter2(p0) & mask / we write it in filter4() + + ; (m0: hev, m1: flat8out, m2: flat8in, m3: fm, m7..m8: q0' p0', m10..13: p1 p0 q0 q1, m14: pb_10, m15: q0-p0) + ; filter4() + mova m4, m15 + paddsb m15, m4 ; 2 * (q0 - p0) + paddsb m15, m4 ; 3 * (q0 - p0) + paddsb m6, m15, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m15, [pb_3] ; m15: f2 = clip(f + 3, 127) + SRSHIFT3B_2X m6, m15, m14, m9 ; f1 and f2 sign byte shift by 3 + pandn m5, m2, m3 ; ~mask(in) & mask(fm) + pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm)) + SIGN_SUB m9, m12, m6, m4, m14 ; q0 - f1 + MASK_APPLY m9, m7, m0, m5 ; m9 = filter4(q0) & mask +%ifidn %1, v + mova [dstq], m9 ; update q0 +%else + mova [rsp + 128], m9 ; update q0 +%endif + SIGN_ADD m7, m11, m15, m4, m14 ; p0 + f2 + MASK_APPLY m7, m8, m0, m5 ; m7 = filter4(p0) & mask +%ifidn %1, v + mova [dstq + 1*mstrideq], m7 ; update p0 +%else + mova [rsp + 112], m7 ; update p0 +%endif + paddb m6, [pb_80] ; + pxor m8, m8 ; f=(f1+1)>>1 + pavgb m6, m8 ; + psubb m6, [pb_40] ; + SIGN_ADD m7, m10, m6, m8, m9 ; p1 + f + SIGN_SUB m4, m13, m6, m8, m9 ; q1 - f + MASK_APPLY m7, m10, m0, m14 ; m7 = filter4(p1) + MASK_APPLY m4, m13, m0, m14 ; m4 = filter4(q1) +%ifidn %1, v + mova [dstq + 2*mstrideq], m7 ; update p1 + mova [dstq + 1* strideq], m4 ; update q1 +%else + mova [rsp + 96], m7 ; update p1 + mova [rsp + 144], m4 ; update q1 +%endif + + ; (m1: flat8out, m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1) + ; filter6() + pxor m0, m0 + pand m2, m3 ; mask(fm) & mask(in) + pandn m3, m1, m2 ; ~mask(out) & (mask(fm) & mask(in)) +%ifidn %1, v + lea dst1q, [dstq + 2*strideq] ; dst1 = &dst[stride * +2] (q2) + mova m8, [dst1q] ; m8 = q2 + mova m9, [dst1q + 1*strideq] ; m9 = q3 + lea dst1q, [dstq + 4*mstrideq] ; dst1 = &dst[stride * -4] (p3) + lea dst2q, [dst1q + 1*strideq] ; dst2 = &dst[stride * -3] (p2) + mova m14, [dst1q] ; m14 = p3 + mova m15, [dst2q] ; m15 = p2 + FILTER_INIT m4, m5, m6, m7, [dst2q ], 6, m3, m15 ; [p2] + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*strideq], m14, m15, m10, m13, 3, m3 ; [p1] -p3 -p2 +p1 +q1 + FILTER_UPDATE m4, m5, m6, m7, [dst2q + 2*strideq], m14, m10, m11, m8, 3, m3 ; [p0] -p3 -p1 +p0 +q2 + FILTER_UPDATE m6, m7, m4, m5, [dstq ], m14, m11, m12, m9, 3, m3 ; [q0] -p3 -p0 +q0 +q3 + FILTER_UPDATE m4, m5, m6, m7, [dstq + 1*strideq], m15, m12, m13, m9, 3, m3 ; [q1] -p2 -q0 +q1 +q3 + FILTER_UPDATE m6, m7, m4, m5, [dstq + 2*strideq], m10, m13, m8, m9, 3, m3, m8 ; [q2] -p1 -q1 +q2 +q3 +%else + mova m14, [rsp + 64] ; m14 = p3 + mova m15, [rsp + 80] ; m15 = p2 + mova m8, [rsp + 160] ; m8 = q2 + mova m9, [rsp + 176] ; m9 = q3 + FILTER_INIT m4, m5, m6, m7, [rsp + 80], 6, m3, m15 ; [p2] + FILTER_UPDATE m6, m7, m4, m5, [rsp + 96], m14, m15, m10, m13, 3, m3 ; [p1] -p3 -p2 +p1 +q1 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 112], m14, m10, m11, m8, 3, m3 ; [p0] -p3 -p1 +p0 +q2 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 128], m14, m11, m12, m9, 3, m3 ; [q0] -p3 -p0 +q0 +q3 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 144], m15, m12, m13, m9, 3, m3 ; [q1] -p2 -q0 +q1 +q3 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 160], m10, m13, m8, m9, 3, m3, m8 ; [q2] -p1 -q1 +q2 +q3 +%endif + + ; (m0: 0, m1: flat8out, m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2) + ; filter14() + ; + ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13 + ; + ; q2 q3 p3 p2 p1 p0 q0 q1 + ; p6 -7 p7 p6 p5 p4 . . . . . + ; p5 -6 -p7 -p6 +p5 +q1 . . . . + ; p4 -5 -p7 -p5 +p4 +q2 . . . q2 + ; p3 -4 -p7 -p4 +p3 +q3 . . . 
q3 + ; p2 -3 -p7 -p3 +p2 +q4 . . . q4 + ; p1 -2 -p7 -p2 +p1 +q5 . . . q5 + ; p0 -1 -p7 -p1 +p0 +q6 . . . q6 + ; q0 +0 -p7 -p0 +q0 +q7 . . . q7 + ; q1 +1 -p6 -q0 +q1 +q7 q1 . . . + ; q2 +2 -p5 -q1 +q2 +q7 . q2 . . + ; q3 +3 -p4 -q2 +q3 +q7 . q3 . . + ; q4 +4 -p3 -q3 +q4 +q7 . q4 . . + ; q5 +5 -p2 -q4 +q5 +q7 . q5 . . + ; q6 +6 -p1 -q5 +q6 +q7 . q6 . . + + pand m1, m2 ; mask(out) & (mask(fm) & mask(in)) +%ifidn %1, v + lea dst1q, [dstq + 8*mstrideq] ; dst1 = &dst[stride * -8] (p7) + lea dst2q, [dst1q + 1* strideq] ; dst2 = &dst[stride * -7] (p6) + mova m2, [dst1q] ; m2 = p7 + mova m3, [dst2q] ; m3 = p6 + mova m8, [dst1q + 2*strideq] ; m8 = p5 (dst[stride * -6]) + mova m9, [dst2q + 2*strideq] ; m9 = p4 (dst[stride * -5]) + FILTER_INIT m4, m5, m6, m7, [dst2q], 14, m1, m3 ; [p6] + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*strideq], m2, m3, m8, m13, 4, m1, m8 ; [p5] -p7 -p6 +p5 +q1 + lea dst1q, [dstq + 1*strideq] ; dst1 = &dst[stride * +1] (q1) + mova m13, [dst1q + 1*strideq] ; m13=dst[stride * +2] (q2) + FILTER_UPDATE m4, m5, m6, m7, [dst2q + 2*strideq], m2, m8, m9, m13, 4, m1, m9 ; [p4] -p7 -p5 +p4 +q2 + lea dst2q, [dst2q + 4*strideq] + mova m13, [dst1q + 2*strideq] ; m13=dst[stride * +3] (q3) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*mstrideq], m2, m9, m14, m13, 4, m1, m14 ; [p3] -p7 -p4 +p3 +q3 + mova m13, [dstq + 4*strideq] ; m13=dst[stride * +4] (q4) + FILTER_UPDATE m4, m5, m6, m7, [dst2q], m2, m14, m15, m13, 4, m1 ; [p2] -p7 -p3 +p2 +q4 + mova m13, [dst1q + 4*strideq] ; m13=dst[stride * +5] (q5) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*strideq], m2, m15, m10, m13, 4, m1 ; [p1] -p7 -p2 +p1 +q5 + lea dst1q, [dst1q + 4*strideq] ; dst1 = &dst[stride * +5] (q5) + mova m13, [dst1q + 1*strideq] ; m13=dst[stride * +6] (q6) + FILTER_UPDATE m4, m5, m6, m7, [dst2q + 2*strideq], m2, m10, m11, m13, 4, m1 ; [p0] -p7 -p1 +p0 +q6 + lea dst2q, [dst2q + 4*strideq] + mova m13, [dst1q + 2*strideq] ; m13=dst[stride * +7] (q7) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*mstrideq], m2, m11, m12, m13, 4, m1 ; [q0] -p7 -p0 +q0 +q7 + mova m2, [dst2q] ; m2=dst[stride * +1] (q1) + FILTER_UPDATE m4, m5, m6, m7, [dst2q], m3, m12, m2, m13, 4, m1 ; [q1] -p6 -q0 +q1 +q7 + mova m3, [dst2q + 1*strideq] ; m3=dst[stride * +2] (q2) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*strideq], m8, m2, m3, m13, 4, m1 ; [q2] -p5 -q1 +q2 +q7 + mova m8, [dst2q + 2*strideq] ; m8=dst[stride * +3] (q3) + FILTER_UPDATE m4, m5, m6, m7, [dst2q + 2*strideq], m9, m3, m8, m13, 4, m1, m8 ; [q3] -p4 -q2 +q3 +q7 + lea dst2q, [dst2q + 4*strideq] + mova m9, [dst2q + 1*mstrideq] ; m9=dst[stride * +4] (q4) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*mstrideq],m14, m8, m9, m13, 4, m1, m9 ; [q4] -p3 -q3 +q4 +q7 + mova m14, [dst2q] ; m14=dst[stride * +5] (q5) + FILTER_UPDATE m4, m5, m6, m7, [dst2q], m15, m9, m14, m13, 4, m1, m14 ; [q5] -p2 -q4 +q5 +q7 + mova m15, [dst2q + 1*strideq] ; m15=dst[stride * +6] (q6) + FILTER_UPDATE m6, m7, m4, m5, [dst2q + 1*strideq], m10, m14, m15, m13, 4, m1, m15 ; [q6] -p1 -q5 +q6 +q7 +%else + mova m2, [rsp + 0] ; m2 = p7 + mova m3, [rsp + 16] ; m3 = p6 + mova m8, [rsp + 32] ; m8 = p5 + mova m9, [rsp + 48] ; m9 = p4 + FILTER_INIT m4, m5, m6, m7, [rsp + 16], 14, m1, m3 ; [p6] + FILTER_UPDATE m6, m7, m4, m5, [rsp + 32], m2, m3, m8, m13, 4, m1, m8 ; [p5] -p7 -p6 +p5 +q1 + mova m13, [rsp + 160] ; m13 = q2 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 48], m2, m8, m9, m13, 4, m1, m9 ; [p4] -p7 -p5 +p4 +q2 + mova m13, [rsp + 176] ; m13 = q3 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 64], m2, m9, m14, m13, 4, m1, m14 ; [p3] -p7 -p4 +p3 +q3 + 
mova m13, [rsp + 192] ; m13 = q4 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 80], m2, m14, m15, m13, 4, m1 ; [p2] -p7 -p3 +p2 +q4 + mova m13, [rsp + 208] ; m13 = q5 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 96], m2, m15, m10, m13, 4, m1 ; [p1] -p7 -p2 +p1 +q5 + mova m13, [rsp + 224] ; m13 = q6 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 112], m2, m10, m11, m13, 4, m1 ; [p0] -p7 -p1 +p0 +q6 + mova m13, [rsp + 240] ; m13 = q7 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 128], m2, m11, m12, m13, 4, m1 ; [q0] -p7 -p0 +q0 +q7 + mova m2, [rsp + 144] ; m2 = q1 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 144], m3, m12, m2, m13, 4, m1 ; [q1] -p6 -q0 +q1 +q7 + mova m3, [rsp + 160] ; m3 = q2 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 160], m8, m2, m3, m13, 4, m1 ; [q2] -p5 -q1 +q2 +q7 + mova m8, [rsp + 176] ; m8 = q3 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 176], m9, m3, m8, m13, 4, m1, m8 ; [q3] -p4 -q2 +q3 +q7 + mova m9, [rsp + 192] ; m9 = q4 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 192], m14, m8, m9, m13, 4, m1, m9 ; [q4] -p3 -q3 +q4 +q7 + mova m14, [rsp + 208] ; m14 = q5 + FILTER_UPDATE m4, m5, m6, m7, [rsp + 208], m15, m9, m14, m13, 4, m1, m14 ; [q5] -p2 -q4 +q5 +q7 + mova m15, [rsp + 224] ; m15 = q6 + FILTER_UPDATE m6, m7, m4, m5, [rsp + 224], m10, m14, m15, m13, 4, m1, m15 ; [q6] -p1 -q5 +q6 +q7 +%endif + +%ifidn %1, h + mova m0, [rsp + 0] ; dst[stride * -8] (p7) + mova m1, [rsp + 16] ; dst[stride * -7] (p6) + mova m2, [rsp + 32] ; dst[stride * -6] (p5) + mova m3, [rsp + 48] ; dst[stride * -5] (p4) + mova m4, [rsp + 64] ; dst[stride * -4] (p3) + mova m5, [rsp + 80] ; dst[stride * -3] (p2) + mova m6, [rsp + 96] ; dst[stride * -2] (p1) + mova m7, [rsp + 112] ; dst[stride * -1] (p0) + mova m8, [rsp + 128] ; dst[stride * +0] (q0) + mova m9, [rsp + 144] ; dst[stride * +1] (q1) + mova m10, [rsp + 160] ; dst[stride * +2] (q2) + mova m11, [rsp + 176] ; dst[stride * +3] (q3) + mova m12, [rsp + 192] ; dst[stride * +4] (q4) + mova m13, [rsp + 208] ; dst[stride * +5] (q5) + mova m14, [rsp + 224] ; dst[stride * +6] (q6) + mova m15, [rsp + 240] ; dst[stride * +7] (q7) + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + lea dst1q, [dstq + 8*mstrideq] ; dst1 = &dst[stride * -8] + lea dst2q, [dst1q + 1* strideq] ; dst2 = &dst[stride * -7] + movu [dst1q ], m0 ; dst[stride * -8] (p7) + movu [dst2q ], m1 ; dst[stride * -7] (p6) + movu [dst1q + 2* strideq], m2 ; dst[stride * -6] (p5) + movu [dst2q + 2* strideq], m3 ; dst[stride * -5] (p4) + lea dst1q, [dstq] ; dst1 = &dst[stride * +0] + lea dst2q, [dstq + 1*strideq] ; dst2 = &dst[stride * +1] + movu [dst1q + 4*mstrideq], m4 ; dst[stride * -4] (p3) + movu [dst2q + 4*mstrideq], m5 ; dst[stride * -3] (p2) + movu [dst1q + 2*mstrideq], m6 ; dst[stride * -2] (p1) + movu [dst2q + 2*mstrideq], m7 ; dst[stride * -1] (p0) + movu [dst1q ], m8 ; dst[stride * +0] (q0) + movu [dst2q ], m9 ; dst[stride * +1] (q1) + movu [dst1q + 2* strideq], m10 ; dst[stride * +2] (q2) + movu [dst2q + 2* strideq], m11 ; dst[stride * +3] (q3) + movu [dst1q + 4* strideq], m12 ; dst[stride * +4] (q4) + movu [dst2q + 4* strideq], m13 ; dst[stride * +5] (q5) + lea dst1q, [dstq + 8*strideq] ; dst1 = &dst[stride * +8] + movu [dst1q + 2*mstrideq], m14 ; dst[stride * +6] (q6) + movu [dst1q + 1*mstrideq], m15 ; dst[stride * +7] (q7) +%endif +%endmacro + +%macro LPF_16_16_VH 1 +INIT_XMM %1 +cglobal vp9_loop_filter_v_16_16, 5,8,16, dst, stride, E, I, H, mstride, dst1, dst2 + LPF_16_16 v + RET +cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst1, dst2 + LPF_16_16 h + RET +%endmacro 
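Worth spelling out, since annotations like "-p7 -p6 +p5 +q1" are terse: FILTER6_INIT/FILTER14_INIT compute only the first output of the flat filters as a full weighted sum, and every subsequent FILTER_UPDATE derives the next output from the cached 16-bit sum with two subtractions and two additions (FILTER_SUBx2_ADDx2), a sliding-window trick. A scalar model of the filter14 case, under an assumed layout px[0..15] = p7..q7 and out[0..13] = p6'..q6' (masking omitted; the real code only commits lanes where fm & flat8in & flat8out):

    #include <stdint.h>

    static void filter14_column(const uint8_t *px, uint8_t *out)
    {
        /* seed: p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8 */
        int sum = px[0] * 7 + px[1] * 2 + 8;
        for (int i = 2; i <= 8; i++)
            sum += px[i];
        out[0] = sum >> 4;                          /* p6' */
        for (int i = 1; i < 14; i++) {
            int sub = (i < 8) ? px[0] : px[i - 7];  /* -p7 ..., then -p6.. */
            int add = (i < 8) ? px[i + 8] : px[15]; /* +q1.., then +q7 */
            sum += add + px[i + 1] - sub - px[i];
            out[i] = sum >> 4;                      /* p5' .. q6' */
        }
    }

filter6() is the same structure over a shorter window: seed p3*3 + p2*2 + p1 + p0 + q0 + 4, shift by 3, six outputs (p2'..q2'). The big comment table in filter14() above is exactly this update schedule written out row by row.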
+ +LPF_16_16_VH sse2 +LPF_16_16_VH ssse3 +LPF_16_16_VH avx + +%endif ; x86-64 diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9mc.asm index 6488f3092d..488ad70ecf 100644 --- a/libavcodec/x86/vp9dsp.asm +++ b/libavcodec/x86/vp9mc.asm @@ -1,22 +1,22 @@ ;****************************************************************************** -;* VP9 SIMD optimizations +;* VP9 MC SIMD optimizations ;* ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -87,7 +87,7 @@ SECTION .text %macro filter_h_fn 1 %assign %%px mmsize/2 -cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery +cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery mova m6, [pw_256] mova m7, [filteryq+ 0] %if ARCH_X86_64 && mmsize > 8 @@ -145,30 +145,85 @@ INIT_XMM ssse3 filter_h_fn put filter_h_fn avg +%if ARCH_X86_64 +%macro filter_hx2_fn 1 +%assign %%px mmsize +cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery + mova m13, [pw_256] + mova m8, [filteryq+ 0] + mova m9, [filteryq+16] + mova m10, [filteryq+32] + mova m11, [filteryq+48] +.loop: + movu m0, [srcq-3] + movu m1, [srcq-2] + movu m2, [srcq-1] + movu m3, [srcq+0] + movu m4, [srcq+1] + movu m5, [srcq+2] + movu m6, [srcq+3] + movu m7, [srcq+4] + add srcq, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m2 + paddw m1, m3 + paddw m4, m6 + paddw m5, m7 + paddsw m0, m4 + paddsw m1, m5 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_hx2_fn put +filter_hx2_fn avg + +%endif ; ARCH_X86_64 + %macro filter_v_fn 1 %assign %%px mmsize/2 %if ARCH_X86_64 -cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3 +cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 %else -cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3 +cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 mov filteryq, r5mp %define hd r4mp %endif - sub srcq, sstrideq - lea sstride3q, [sstrideq*3] - sub srcq, sstrideq mova m6, [pw_256] - sub srcq, 
sstrideq + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q mova m7, [filteryq+ 0] - lea src4q, [srcq+sstrideq*4] %if ARCH_X86_64 && mmsize > 8 mova m8, [filteryq+16] mova m9, [filteryq+32] mova m10, [filteryq+48] %endif .loop: - ; FIXME maybe reuse loads from previous rows, or just more generally - ; unroll this to prevent multiple loads of the same data? + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? movh m0, [srcq] movh m1, [srcq+sstrideq] movh m2, [srcq+sstrideq*2] @@ -219,6 +274,70 @@ INIT_XMM ssse3 filter_v_fn put filter_v_fn avg +%if ARCH_X86_64 + +%macro filter_vx2_fn 1 +%assign %%px mmsize +cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3 + mova m13, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m8, [filteryq+ 0] + mova m9, [filteryq+16] + mova m10, [filteryq+32] + mova m11, [filteryq+48] +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + movu m4, [src4q] + movu m5, [src4q+sstrideq] + movu m6, [src4q+sstrideq*2] + movu m7, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m2 + paddw m1, m3 + paddw m4, m6 + paddw m5, m7 + paddsw m0, m4 + paddsw m1, m5 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_vx2_fn put +filter_vx2_fn avg + +%endif ; ARCH_X86_64 + %macro fpel_fn 6 %if %2 == 4 %define %%srcfn movh @@ -229,11 +348,11 @@ filter_v_fn avg %endif %if %2 <= 16 -cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3 +cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 lea sstride3q, [sstrideq*3] lea dstride3q, [dstrideq*3] %else -cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h +cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h %endif .loop: %%srcfn m0, [srcq] diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c index 2f064cad7b..25e833fef3 100644 --- a/libavcodec/x86/w64xmmtest.c +++ b/libavcodec/x86/w64xmmtest.c @@ -2,20 +2,20 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
 * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -65,6 +65,13 @@ wrap(avcodec_encode_audio2(AVCodecContext *avctx, got_packet_ptr); } +wrap(avcodec_encode_video(AVCodecContext *avctx, + uint8_t *buf, int buf_size, + const AVFrame *pict)) +{ + testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict); +} + wrap(avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size, const AVSubtitle *sub))
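Stepping back to the vp9mc.asm rename above: the new filter_hx2_fn/filter_vx2_fn variants widen the 8-tap sub-pel filters to a full 16-pixel XMM row. SBUTTERFLY interleaves neighboring source bytes so pmaddubsw can accumulate the eight taps as four signed pairs, and the final pmulhrsw against pw_256 is the VP9 rounding shift, since pmulhrsw(x, 256) = (x*256 + 0x4000) >> 15 = (x + 64) >> 7, matching FILTER_BITS = 7. A scalar model of one output pixel (names illustrative; this ignores the intermediate 16-bit saturating adds, paddsw, that the SIMD version relies on):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* One 8-tap filtered pixel; the taps span src[-3]..src[+4],
       exactly the window the movu m0..m7 loads cover. */
    static uint8_t filter8(const uint8_t *src, const int8_t *taps,
                           uint8_t prev, int avg)
    {
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += src[i - 3] * taps[i];
        int px = clip_uint8((sum + 64) >> 7);   /* pmulhrsw vs pw_256 */
        return avg ? (px + prev + 1) >> 1 : px; /* pavgb in the avg_ kernels */
    }

The vertical variant is the same arithmetic with the taps walking down sstride instead of along the row, which is why the two macros share their inner multiply/add sequence verbatim.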